## Data Preprocessing

In [1]:
#Importing the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Importing the dataset
dataset = pd.read_csv("helpinghands_dataset.csv")

In [3]:
dataset.head()

Unnamed: 0,YEAR,MONTH,STATNAME,STATCD,DISTNAME,DISTCD,EVENT_NAME,EVENT_CODE
0,2013,1,ANDHRA PRADESH,28,PRAKASAM,2818,poverty,1
1,2013,8,ASSAM,18,UDALGURI,1826,education,3
2,2013,9,PUNJAB,3,HOSHIARPUR,305,education,3
3,2013,8,TELANGANA,36,NALGONDA,3608,education,3
4,2013,4,KARNATAKA,29,KODAGU,2925,donations,4


In [4]:
dataset.isnull().sum()

YEAR          0
MONTH         0
STATNAME      0
STATCD        0
DISTNAME      0
DISTCD        0
EVENT_NAME    0
EVENT_CODE    0
dtype: int64

In [5]:
#Independent features
X = dataset.iloc[:, [1,5]].values

In [6]:
X

array([[   1, 2818],
       [   8, 1826],
       [   9,  305],
       ...,
       [   2, 1202],
       [   4, 2811],
       [   2, 1005]])

In [7]:
# Call form of print: valid on Python 2 and 3, same output for a single value.
print(X.shape)

(80000, 2)


In [8]:
#dependent variable vector
y = dataset.iloc[:, 7].values

In [9]:
y

array([1, 3, 3, ..., 1, 1, 1])

In [10]:
#Encode categorical data

# The class is named OneHotEncoder; the previous "OneHotEncoderEncoder" import
# raised ImportError and left the name used below undefined.
from sklearn.preprocessing import OneHotEncoder
# Both columns of X (indices 0 and 1) hold integer codes and are expanded into
# indicator columns.
# NOTE: `categorical_features` was deprecated in scikit-learn 0.20 and removed
# in later releases; on newer versions drop the argument (every column is
# encoded by default).
onehotencoder = OneHotEncoder(categorical_features = [0,1])


In [11]:
def encode_data(X, fit=True):
    """One-hot encode ``X`` with the notebook-level ``onehotencoder``.

    Parameters
    ----------
    X : array-like
        Feature matrix to encode.
    fit : bool, default True
        When True (the original behaviour) the encoder is re-fitted on ``X``
        before transforming. Pass False to encode further data with the
        categories learned by an earlier fit, so that the output columns stay
        aligned with the matrix the encoder was fitted on.

    Returns
    -------
    The encoder output converted with ``.toarray()``.
    """
    if fit:
        encoded = onehotencoder.fit_transform(X)
    else:
        # Re-fitting on new data would change the column layout.
        encoded = onehotencoder.transform(X)
    return encoded.toarray()

In [12]:
df_X = pd.DataFrame(data=X)
df_y = pd.DataFrame(data=y)

In [13]:
df_X.head()

Unnamed: 0,0,1
0,1,2818
1,8,1826
2,9,305
3,8,3608
4,4,2925


In [14]:
# Call form of print: valid on Python 2 and 3, same output for a single value.
print(X.shape)

(80000, 2)


In [15]:
X = encode_data(X)

In [16]:
# Call form of print: valid on Python 2 and 3, same output for a single value.
print(X.shape)

(80000, 112)


In [17]:
#Splitting the dataset into training and test set

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection provides the same train_test_split.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2 , random_state=0)



In [18]:
# Call form of print: valid on Python 2 and 3, same output for a single value.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(64000, 112)
(16000, 112)
(64000,)
(16000,)


In [19]:
X_train , X_test , y_train , y_test

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]), array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), array([1, 4, 1, ..., 1, 1, 1]), array([1, 2, 1, ..., 1, 1, 1]))

In [20]:
#Feature Scaling

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

def get_scaled_data(X_train, X_test):
    """Standardise a train/test pair with the notebook-level scaler ``sc_X``.

    ``sc_X`` is (re-)fitted on ``X_train`` and the statistics learned there are
    then applied to ``X_test``.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)``.
    """
    scaled_train = sc_X.fit_transform(X_train)
    scaled_test = sc_X.transform(X_test)
    return scaled_train, scaled_test

In [21]:
X_train, X_test = get_scaled_data(X_train, X_test)

In [22]:
X_train

array([[-0.31017658, -0.30481935, -0.31087222, ..., -0.09858266,
        -0.10121517, -0.09954767],
       [-0.31017658, -0.30481935, -0.31087222, ..., -0.09858266,
        -0.10121517, -0.09954767],
       [-0.31017658, -0.30481935, -0.31087222, ..., -0.09858266,
        -0.10121517, -0.09954767],
       ...,
       [-0.31017658, -0.30481935,  3.21675578, ..., -0.09858266,
        -0.10121517, -0.09954767],
       [-0.31017658, -0.30481935, -0.31087222, ..., -0.09858266,
        -0.10121517, -0.09954767],
       [-0.31017658, -0.30481935, -0.31087222, ..., -0.09858266,
        -0.10121517, 10.04543817]])

## Fitting KNN classifier to the training set

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
def fit_model(X_train, y_train):
    """Build a KNN classifier and fit it on the given training data.

    The classifier uses 8 neighbours and the Minkowski metric with ``p=2``.

    Returns
    -------
    The fitted ``KNeighborsClassifier`` instance.
    """
    knn_params = {'n_neighbors': 8, 'metric': 'minkowski', 'p': 2}
    classifier = KNeighborsClassifier(**knn_params)
    classifier.fit(X_train, y_train)
    return classifier

In [25]:
model_knn = fit_model(X_train, y_train)

In [26]:
y_pred = model_knn.predict(X_test)

In [27]:
# Call form of print: valid on Python 2 and 3, same output for a single value.
print(X_test.shape)

(16000, 112)


In [28]:
# Call form of print: valid on Python 2 and 3, same output for a single value.
print(y_pred.shape)

(16000,)


In [29]:
y_pred[:5]

array([1, 1, 1, 1, 1])

In [30]:
# saving the model: serialise the fitted KNN classifier to disk with joblib

import pickle  # NOTE(review): not referenced in this cell; joblib does the dump
from sklearn.externals import joblib
# NOTE(review): the "_5" suffix does not match n_neighbors=8 set in fit_model --
# confirm the intended file name.
filename = 'knn_model_5.pkl'
joblib.dump(model_knn, filename)

['knn_model_5.pkl']

In [31]:
# load the model from disk

loaded_model = joblib.load(filename)
score = loaded_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

Test score: 65.99 %


In [32]:
# Call form of print: valid on Python 2 and 3, same output for a single value.
print(X_test.dtype)

float64


## Testing on 2017 data

In [None]:
# Load the rows that the trained model will label.
# NOTE(review): the section heading says 2017, the variable name says 2016 and
# the file name says 2012 -- confirm which year this data actually covers.
X_2016 = pd.read_csv("dataset_2012.csv")

In [None]:
X_2016.head()

In [None]:
X_2016_data = X_2016.iloc[:, [1,5]].values

In [None]:
X_2016_data.shape

In [None]:
# Number of rows encoded, scaled and predicted per chunk.
k = 1000
def make_prediction_data() :
    """Predict events for the first 10000 rows of ``X_2016`` in chunks of ``k``.

    For each chunk the feature columns held in ``X_2016_data`` are encoded and
    scaled with the notebook-level ``onehotencoder`` and ``sc_X`` exactly as
    they were fitted on the training data, then handed to ``predict_data``
    together with the matching raw rows of ``X_2016``.

    Both transformers must already be fitted (they are fitted when the
    training data is encoded and scaled earlier in the notebook). They are
    only applied here, never re-fitted: re-fitting on a chunk would change the
    one-hot column layout and the scaling statistics the model was trained on.
    """
    # Never read past the end of the data when it holds fewer than 10000 rows.
    total_rows = min(10000, len(X_2016_data))
    for row in range(0, total_rows, k) :
        stop = min(row + k, total_rows)
        X = X_2016_data[row:stop, :]
        # Raw rows of this chunk (row..stop), so every prediction is written
        # next to the row it was computed from.
        x_write = [list(X_2016.iloc[i, :].values) for i in range(row, stop)]
        X_encoded = onehotencoder.transform(X).toarray()
        X_test_2016 = sc_X.transform(X_encoded)
        # The last chunk may be shorter than k, so pass its real length.
        predict_data(X_test_2016, x_write, len(x_write))


event_dict = {1: 'poverty', 2: 'health_care', 3: 'education', 4: 'donations'}

def predict_data(x_test, x_file, k) :
    """Predict with ``model_knn`` and write the first ``k`` labelled rows.

    Each of the first ``k`` rows of ``x_file`` is extended in place with the
    event name looked up in ``event_dict`` and the predicted code, then passed
    to ``write_to_file``.
    """
    predicted_codes = model_knn.predict(x_test)

    for idx in range(k) :
        code = predicted_codes[idx]
        label = event_dict[code]
        record = x_file[idx]
        record.extend([label, code])
        write_to_file(record)

import csv
def write_to_file(row) :
    """Append ``row`` as one CSV record to 'helpinghands_dataset.csv'."""
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open('helpinghands_dataset.csv', 'a') as out_file:
        csv.writer(out_file).writerow(row)

In [None]:
make_prediction_data()

In [None]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection provides the same train_test_split.
from sklearn.model_selection import train_test_split
def show_prediction_info():
    """Re-train KNN on the current 'helpinghands_dataset.csv' and print its test score.

    The CSV is re-read from disk so that rows appended since the notebook
    first loaded it are included. Columns 1 and 5 are used as features and
    column 7 as the target, matching the original training cells. Shapes of
    the raw and split arrays are printed along the way.

    Side effect: ``encode_data`` and ``get_scaled_data`` re-fit the
    notebook-level encoder and scaler on this data.
    """
    # Use the freshly read frame, not the stale notebook-level ``dataset``.
    data = pd.read_csv("helpinghands_dataset.csv")
    X = data.iloc[:, [1,5]].values
    y = data.iloc[:, 7].values
    print(X.shape)
    print(y.shape)
    X = encode_data(X)
    X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2 , random_state=0)
    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    X_train, X_test = get_scaled_data(X_train, X_test)
    model = fit_model(X_train, y_train)
    # Score the model trained just above rather than whichever model the
    # notebook-level ``model_knn`` / ``loaded_model`` currently point to.
    score = model.score(X_test, y_test)
    print("Test score: {0:.2f} %".format(100 * score))
    

In [None]:
show_prediction_info()

# Fitting SVM classifier to the training set

In [33]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear' , random_state=0)
classifier.fit(X_train , y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [34]:
y_pred = classifier.predict(X_test)

In [35]:
# saving the model

import pickle
from sklearn.externals import joblib
filename = 'svm_model.pkl'
joblib.dump(classifier, filename)

['svm_model.pkl']

In [36]:
# load the model from disk

loaded_model = joblib.load(filename)
score = loaded_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

Test score: 67.81 %


# Fitting Decision Tree classifier to the training set

In [37]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy' , random_state=0)
classifier.fit(X_train , y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [38]:
y_pred = classifier.predict(X_test)

In [39]:
# saving the model

import pickle
from sklearn.externals import joblib
filename = 'decision_tree_model.pkl'
joblib.dump(classifier, filename)

['decision_tree_model.pkl']

In [40]:
# load the model from disk

loaded_model = joblib.load(filename)
score = loaded_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

Test score: 67.74 %


# Fitting Random Forest classifier to the training set 

In [41]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10 , criterion='entropy' , random_state=0)
classifier.fit(X_train , y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [42]:
y_pred = classifier.predict(X_test)

In [43]:
# saving the model

import pickle
from sklearn.externals import joblib
filename = 'rf_model.pkl'
joblib.dump(classifier, filename)

['rf_model.pkl']

In [44]:
# load the model from disk

loaded_model = joblib.load(filename)
score = loaded_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

Test score: 67.69 %


# Fitting Naive Bayes classifier to the training set

In [45]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [46]:
y_pred = classifier.predict(X_test)

In [47]:
# saving the model

import pickle
from sklearn.externals import joblib
filename = 'nb_model.pkl'
joblib.dump(classifier, filename)

['nb_model.pkl']

In [48]:
# load the model from disk

loaded_model = joblib.load(filename)
score = loaded_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

Test score: 30.36 %
