## Data Preprocessing

In [3]:
#Importing the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
#Importing the dataset
dataset = pd.read_csv("helpinghands_dataset.csv")

In [5]:
dataset.head()

Unnamed: 0,YEAR,MONTH,STATNAME,STATCD,DISTNAME,DISTCD,EVENT_NAME,EVENT_CODE
0,2013,1,ANDHRA PRADESH,28,PRAKASAM,2818,poverty,1
1,2013,8,ASSAM,18,UDALGURI,1826,education,3
2,2013,9,PUNJAB,3,HOSHIARPUR,305,education,3
3,2013,8,TELANGANA,36,NALGONDA,3608,education,3
4,2013,4,KARNATAKA,29,KODAGU,2925,donations,4


In [6]:
dataset.isnull().sum()

YEAR          0
MONTH         0
STATNAME      0
STATCD        0
DISTNAME      0
DISTCD        0
EVENT_NAME    0
EVENT_CODE    0
dtype: int64

In [7]:
#Independent features
X = dataset.iloc[:, [1,5]].values

In [8]:
X

array([[   1, 2818],
       [   8, 1826],
       [   9,  305],
       ..., 
       [  10, 2712],
       [   2, 1102],
       [   7, 1401]])

In [9]:
print X.shape

(10000, 2)


In [10]:
#dependent variable vector
y = dataset.iloc[:, 7].values

In [11]:
y

array([1, 3, 3, ..., 3, 3, 2])

In [12]:
#Encode categorical data

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [0,1])


In [13]:
def encode_data(X):
    X = onehotencoder.fit_transform(X).toarray()
    return X

In [14]:
df_X = pd.DataFrame(data=X)
df_y = pd.DataFrame(data=y)

In [15]:
df_X.head()

Unnamed: 0,0,1
0,1,2818
1,8,1826
2,9,305
3,8,3608
4,4,2925


In [16]:
print X.shape

(10000, 2)


In [17]:
X = encode_data(X)

In [18]:
print X.shape

(10000, 112)


In [19]:
#Splitting the dataset into training and test set

from sklearn.cross_validation import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2 , random_state=0)



In [20]:
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(8000, 112)
(2000, 112)
(8000,)
(2000,)


In [21]:
X_train , X_test , y_train , y_test

(array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([1, 1, 2, ..., 1, 2, 3]),
 array([4, 3, 4, ..., 3, 3, 4]))

In [22]:
#Feature Scaling

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

def get_scaled_data(X_train, X_test):
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    return X_train, X_test

In [23]:
X_train, X_test = get_scaled_data(X_train, X_test)

In [24]:
X_train

array([[-0.29986373, -0.29388111, -0.3050336 , ..., -0.10666772,
        -0.10113636, -0.10903992],
       [-0.29986373, -0.29388111, -0.3050336 , ..., -0.10666772,
        -0.10113636, -0.10903992],
       [-0.29986373, -0.29388111, -0.3050336 , ..., -0.10666772,
        -0.10113636, -0.10903992],
       ..., 
       [-0.29986373,  3.4027366 , -0.3050336 , ..., -0.10666772,
        -0.10113636, -0.10903992],
       [-0.29986373, -0.29388111, -0.3050336 , ..., -0.10666772,
        -0.10113636, -0.10903992],
       [-0.29986373, -0.29388111, -0.3050336 , ..., -0.10666772,
        -0.10113636, -0.10903992]])

## Fitting KNN classifier to the training set

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [74]:
def fit_model(X_train, y_train):
    model_knn = KNeighborsClassifier(n_neighbors=8, metric='minkowski' , p=2)
    model_knn.fit(X_train , y_train)
    return model_knn

In [75]:
model_knn = fit_model(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='uniform')

In [27]:
y_pred = model_knn.predict(X_test)

In [28]:
print X_test.shape

(2000, 112)


In [29]:
print y_pred.shape

(2000,)


In [30]:
y_pred[:5]

array([1, 2, 4, 3, 4])

In [31]:
# saving the model

import pickle
from sklearn.externals import joblib
filename = 'knn_model.pkl'
joblib.dump(model_knn, filename)

['knn_model.pkl']

In [32]:
# load the model from disk

loaded_model = joblib.load(filename)
score = loaded_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

Test score: 25.50 %


In [33]:
print X_test.dtype

float64


## Tesing on 2014 data

In [34]:
X_2014 = pd.read_csv("dataset_2014.csv")

In [35]:
X_2014.head()

Unnamed: 0,YEAR,MONTH,STATNAME,STATCD,DISTNAME,DISTCD
0,2014,9,JAMMU & KASHMIR,1,KISHTWAR,117
1,2014,12,SIKKIM,11,WEST SIKKIM,1102
2,2014,5,TRIPURA,16,SOUTH TRIPURA,1602
3,2014,8,TRIPURA,16,SOUTH TRIPURA,1602
4,2014,9,HARYANA,6,MAHENDRAGARH,616


In [36]:
X_2014_data = X_2014.iloc[:, [1,5]].values

In [37]:
X_2014_data.shape

(10000, 2)

In [70]:
k = 1000
def make_prediction_data() :
    count=0
    for row in range (0, 10000, k) :
        x_write = []
        count+=1
        X = X_2014_data[row:row+k, :]
        for i in xrange(k):   
            x_write.append(list(X_2014.iloc[i, :].values))
        X_encoded = encode_data(X)
        _ , X_test_2014 = get_scaled_data(X_train, X_encoded)
        predict_data(X_test_2014, x_write, k)


event_dict = {1: 'poverty', 2: 'health_care', 3: 'education', 4: 'donations'}

def predict_data(x_test, x_file, k) :
    
    y_pred = model_knn.predict(x_test)
        
    for i in range(k) :
        event = event_dict[y_pred[i]]
        x_file[i].append(event)
        x_file[i].append(y_pred[i])
        write_to_file(x_file[i])

import csv
def write_to_file(row) :
        with open('helpinghands_dataset.csv', 'a') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(row)
        csvFile.close()

In [71]:
make_prediction_data()

In [76]:
from sklearn.cross_validation import train_test_split
def show_prediction_info():
    pd.read_csv("helpinghands_dataset.csv")
    X = dataset.iloc[:, [1,5]].values
    y = dataset.iloc[:, 7].values
    print X.shape
    print y.shape
    X = encode_data(X)
    X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2 , random_state=0)
    print X_train.shape
    print X_test.shape
    print y_train.shape
    print y_test.shape
    X_train, X_test = get_scaled_data(X_train, X_test)
    model = fit_model(X_train, y_train)
    y_pred = model_knn.predict(X_test)
    score = loaded_model.score(X_test, y_test)
    print("Test score: {0:.2f} %".format(100 * score))
    

In [77]:
show_prediction_info()

(10000, 2)
(10000,)
(8000, 112)
(2000, 112)
(8000,)
(2000,)
Test score: 25.50 %
