In [1]:
#Importing the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import csv

In [10]:
class DummyDataBuilderEncoded:

    def __init__(self):
        self.months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        self.events = ['poverty', 'health_care', 'education', 'donations']
        self.event_codes = [1, 2, 3, 4]
        self.event_dict = {1: 'poverty', 2: 'health_care', 3: 'education', 4: 'donations'}

    def generate_data(self):
            for i in xrange(10000):
                year = 2010
                self.organise_data_from_csv(year)

    def organise_data_from_csv(self, year):
        states_districts = pd.read_csv("important_districts.csv").iloc[:, 1: 5]
        random_row = states_districts.sample(1)
        state = ''.join(random_row['STATNAME'].values)
        state_code = int(random_row['STATCD'])
        district = ''.join(random_row['DISTNAME'].values)
        district_code = int(random_row['DISTCD'])

        month = random.choice(self.months)
        event_code = random.choice(self.event_codes)
        event = self.event_dict[event_code]

        self.write_data_to_csv(year, month, state, state_code, district, district_code, event, event_code)

    @staticmethod
    def write_data_to_csv(year, month, state, state_code, district, district_code, event, event_code):

        row = [year, month, state, state_code, district, district_code, event, event_code]
        with open('random_dataset_2010.csv', 'a') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(row)
        csvFile.close()

###### using important distriscts only to generate random data.

In [11]:
DummyDataBuilderEncoded().generate_data()

In [12]:
# now a random dataset for year 2010 is generated 

In [13]:
#Importing the dataset
dataset = pd.read_csv("random_dataset_2010.csv")

In [14]:
dataset.head()

Unnamed: 0,YEAR,MONTH,STATNAME,STATCD,DISTNAME,DISTCD,EVENT_NAME,EVENT_CODE
0,2010,5,UTTAR PRADESH,9,KUSHINAGAR,959,donations,4
1,2010,7,CHANDIGARH,4,CHANDIGARH,401,health_care,2
2,2010,6,JHARKHAND,20,GODDA,2008,health_care,2
3,2010,3,MADHYA PRADESH,23,SIDHI,2317,health_care,2
4,2010,2,MADHYA PRADESH,23,ALIRAJPUR,2349,donations,4


In [15]:
#Independent features - month and district code
X = dataset.iloc[:, [1,5]].values

In [16]:
X[:5]

array([[   5,  959],
       [   7,  401],
       [   6, 2008],
       [   3, 2317],
       [   2, 2349]])

In [17]:
print X.shape

(10000, 2)


In [18]:
#dependent variable vector - event code
y = dataset.iloc[:, 7].values

In [19]:
y[:5]

array([4, 2, 2, 2, 4])

In [20]:
#Encode categorical data

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [0,1])

In [21]:
def encode_data(X):
    X = onehotencoder.fit_transform(X).toarray()
    return X

In [22]:
X = encode_data(X)
print X.shape

(10000, 112)


In [25]:
#Splitting the dataset into training and test set

from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.25 , random_state=0)

In [26]:
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(7500, 112)
(2500, 112)
(7500,)
(2500,)


In [27]:
#Feature Scaling

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

def get_scaled_data(X_train, X_test):
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    return X_train, X_test

In [28]:
X_train, X_test = get_scaled_data(X_train, X_test)

In [29]:
print X_train.shape , X_test.shape

(7500, 112) (2500, 112)


## Fitting KNN classifier to the training set

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [31]:
def fit_model(X_train, y_train):
    model_knn = KNeighborsClassifier(n_neighbors=8, metric='minkowski' , p=2)
    model_knn.fit(X_train , y_train)
    return model_knn

In [32]:
model_knn = fit_model(X_train, y_train)

In [33]:
y_pred = model_knn.predict(X_test)

In [34]:
print y_pred.shape

(2500,)


In [5]:
# saving the model
from sklearn.externals import joblib
filename = 'knn_model_trained_on_2010.pkl'

In [7]:
joblib.dump(model_knn, filename)

In [9]:
# load the model from disk
loaded_model = joblib.load(filename)

In [None]:
score = loaded_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

In [37]:
# now we have a model ready that is trained on 2010 random dataset.
# After this , for every next year event code and event name will be predicted using the previously learned model_knn 

In [None]:
# for making datasets for each year without events 

In [10]:
class DummyDataBuilderEncodedWithoutEvent:

    def __init__(self):
        self.years = [2011, 2012, 2013, 2014, 2015, 2016, 2017]
        self.months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

    def generate_data(self):
        for year in self.years:
            filename = "dataset_"+str(year)+".csv"
            row = ["YEAR", "MONTH", "STATNAME", "STATCD", "DISTNAME", "DISTCD"]

            with open(filename, 'a') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(row)
            csvFile.close()
            for i in xrange(10000):
                self.organise_data_from_csv(year, filename)

    def organise_data_from_csv(self, year, filename):
        states_districts = pd.read_csv("important_districts.csv").iloc[:, 1: 5]
        random_row = states_districts.sample(1)
        state = ''.join(random_row['STATNAME'].values)
        state_code = int(random_row['STATCD'])
        district = ''.join(random_row['DISTNAME'].values)
        district_code = int(random_row['DISTCD'])

        month = random.choice(self.months)

        self.write_data_to_csv(year, month, state, state_code, district, district_code, filename)

    @staticmethod
    def write_data_to_csv(year, month, state, state_code, district, district_code, filename):

        row = [year, month, state, state_code, district, district_code]

        with open(filename, 'a') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(row)

        csvFile.close()


In [43]:
DummyDataBuilderEncodedWithoutEvent().generate_data()
    

In [11]:
# now we have datasets for year 2011 to 2017 but without corresponding events
# events of these years will be predicted using learned models and predicted rows will be added to the dataset

###### predicting and genrating dataset

In [45]:
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

class PredictGenerate():
    def __init__(self):
        self.years = [2011, 2012, 2013, 2014, 2015, 2016, 2017]
        self.sc_X = StandardScaler()
        self.onehotencoder = OneHotEncoder(categorical_features = [0,1])
        self.event_dict = {1: 'poverty', 2: 'health_care', 3: 'education', 4: 'donations'}
        self.model_knn = joblib.load('knn_model_trained_on_2010.pkl')
        
    
    def set_up_data(self):
        for year in self.years:
            filename = "dataset_"+str(year)+".csv"
            df = pd.read_csv(filename)
            X_data = df.iloc[:, [1,5]].values
            self.make_prediction_data(df, X_data, year)

    def make_prediction_data(self, df, X_data, year) :
        k = 1000
        for row in range (0, 10000, k) :
            x_write = []
            X = X_data[row:row+k, :]
            for i in xrange(k):   
                x_write.append(list(df.iloc[i, :].values))
                
            X_encoded = self.encode_data(X)
            X_test = self.get_scaled_test_data(X_encoded)
            self.predict_data(X_test, x_write, k, year)
            self.make_new_model(year)
            
    def predict_data(self, x_test, x_file, k, year) :
        y_pred = self.model_knn.predict(x_test)
        for i in range(k) :
            event = self.event_dict[y_pred[i]]
            x_file[i].append(event)
            x_file[i].append(y_pred[i])
            self.write_to_file(x_file[i], year)
            
    def write_to_file(self, row, year) :
        with open('knn_dataset.csv', 'a') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(row)
        csvFile.close()
        
    
    def make_new_model(self, year):
        dataset = pd.read_csv("knn_dataset.csv")
        X = dataset.iloc[:, [1,5]].values
        y = dataset.iloc[:, 7].values
        X = self.encode_data(X)
        X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.25 , random_state=0)
        X_train, X_test = self.get_scaled_data(X_train, X_test)
        self.model_knn = self.fit_model(X_train, y_train)
        filename = 'knn_model.pkl'
        joblib.dump(self.model_knn, filename)
        score = self.model_knn.score(X_test, y_test)
        print("Test score for year {} : {}%".format(year, 100 * score))

    def fit_model(seld, X_train, y_train):
        model_knn = KNeighborsClassifier(n_neighbors=8, metric='minkowski' , p=2)
        model_knn.fit(X_train , y_train)
        return model_knn
        
    def get_scaled_test_data(self, X_test):
        X_test = self.sc_X.fit_transform(X_test)
        return X_test

    def get_scaled_data(self, X_train, X_test):
        X_train = self.sc_X.fit_transform(X_train)
        X_test = self.sc_X.fit_transform(X_test)
        return X_train, X_test

    def encode_data(self, X):
        X = self.onehotencoder.fit_transform(X).toarray()
        return X

        

In [46]:
PredictGenerate().set_up_data()

Test score for year 2011 : 31.1636363636%
Test score for year 2011 : 29.3333333333%
Test score for year 2011 : 27.2923076923%
Test score for year 2011 : 28.5428571429%
Test score for year 2011 : 26.9333333333%
Test score for year 2011 : 26.625%
Test score for year 2011 : 26.9176470588%
Test score for year 2011 : 27.5777777778%
Test score for year 2011 : 26.8631578947%
Test score for year 2011 : 27.28%
Test score for year 2012 : 30.6285714286%
Test score for year 2012 : 29.8363636364%
Test score for year 2012 : 28.9565217391%
Test score for year 2012 : 28.9666666667%
Test score for year 2012 : 29.568%
Test score for year 2012 : 29.0153846154%
Test score for year 2012 : 29.5407407407%
Test score for year 2012 : 29.9142857143%
Test score for year 2012 : 30.4413793103%
Test score for year 2012 : 30.92%
Test score for year 2013 : 31.7419354839%
Test score for year 2013 : 32.2625%
Test score for year 2013 : 31.9151515152%
Test score for year 2013 : 31.8352941176%
Test score for year 2013 : 3

In [54]:
class PredictGenerate():
    def __init__(self, algo):
        self.years = [2011, 2012, 2013, 2014, 2015, 2016, 2017]
        self.sc_X = StandardScaler()
        self.onehotencoder = OneHotEncoder(categorical_features=[0, 1])
        self.event_dict = {1: 'poverty', 2: 'health_care', 3: 'education', 4: 'donations'}
        self.algo = algo
        self.model = self.get_basic_model()

    def get_basic_model(self):
        dataset = pd.read_csv("random_dataset_2010.csv")
        X = dataset.iloc[:, [1, 5]].values
        y = dataset.iloc[:, 7].values
        onehotencoder = OneHotEncoder(categorical_features=[0, 1])
        X = onehotencoder.fit_transform(X).toarray()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
        sc_X = StandardScaler()
        X_train = sc_X.fit_transform(X_train)
        model = self.fit_model(X_train, y_train)
        filename = self.algo+"_model_trained_on_2010.pkl"
        joblib.dump(model, filename)
        return model

    def set_up_data(self):
        for year in self.years:
            filename = "dataset_" + str(year) + ".csv"
            df = pd.read_csv(filename)
            X_data = df.iloc[:, [1, 5]].values
            self.make_prediction_data(df, X_data, year)

    def make_prediction_data(self, df, X_data, year):
        k = 1000
        for row in range(0, 10000, k):
            x_write = []
            X = X_data[row:row + k, :]
            for i in xrange(k):
                x_write.append(list(df.iloc[i, :].values))

            X_encoded = self.encode_data(X)
            X_test = self.get_scaled_test_data(X_encoded)
            self.predict_data(X_test, x_write, k, year)
            self.make_new_model(year)

    def predict_data(self, x_test, x_file, k, year):
        y_pred = self.model.predict(x_test)
        for i in range(k):
            event = self.event_dict[y_pred[i]]
            x_file[i].append(event)
            x_file[i].append(y_pred[i])
            self.write_to_file(x_file[i], year)

    def write_to_file(self, row, year):
        filename = self.algo+"_dataset.csv"
        with open(filename, 'a') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(row)
        csvFile.close()

    def make_new_model(self, year):
        filename = self.algo + "_dataset.csv"
        dataset = pd.read_csv(filename)
        X = dataset.iloc[:, [1, 5]].values
        y = dataset.iloc[:, 7].values
        X = self.encode_data(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
        X_train, X_test = self.get_scaled_data(X_train, X_test)
        self.model = self.fit_model(X_train, y_train)
        filename = self.algo+"_model.pkl"
        joblib.dump(self.model, filename)
        score = self.model.score(X_test, y_test)
        print("Test score for year {} : {}%".format(year, 100 * score))

    def fit_model(self, X_train, y_train):
        model = None
        if self.algo == "knn":
            model = KNeighborsClassifier(n_neighbors=8, metric='minkowski', p=2)
            model.fit(X_train, y_train)
        elif self.algo == "svm":
            model = SVC(kernel='linear', random_state=0)
            model.fit(X_train, y_train)

        elif self.algo == "decision_tree":
            model = DecisionTreeClassifier(criterion='entropy', random_state=0)
            model.fit(X_train, y_train)

        elif self.algo == "random_forest":
            model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
            model.fit(X_train, y_train)

        elif self.algo == "nb":
            model = GaussianNB()
            model.fit(X_train, y_train)

        return model

    def get_scaled_test_data(self, X_test):
        X_test = self.sc_X.fit_transform(X_test)
        return X_test

    def get_scaled_data(self, X_train, X_test):
        X_train = self.sc_X.fit_transform(X_train)
        X_test = self.sc_X.fit_transform(X_test)
        return X_train, X_test

    def encode_data(self, X):
        X = self.onehotencoder.fit_transform(X).toarray()
        return X


In [55]:
# print "KNN : ********"
# PredictGenerate().set_up_data("knn")


print "SVM : ********"
PredictGenerate("svm").set_up_data()

print "Random Forest : ********"
PredictGenerate("random_forest").set_up_data()

print "Decision Tree : ********"
PredictGenerate("decision_tree").set_up_data()

print "Naive Bayes : ********"
PredictGenerate("nb").set_up_data()

SVM : ********
Test score for year 2011 : 92.0606060606%
Test score for year 2011 : 92.2941176471%
Test score for year 2011 : 92.4685714286%
Test score for year 2011 : 92.7111111111%
Test score for year 2011 : 92.4864864865%
Test score for year 2011 : 92.8736842105%
Test score for year 2011 : 92.8820512821%
Test score for year 2011 : 92.92%
Test score for year 2011 : 93.2195121951%
Test score for year 2011 : 93.4952380952%
Test score for year 2012 : 93.888372093%
Test score for year 2012 : 93.7454545455%
Test score for year 2012 : 93.9466666667%
Test score for year 2012 : 94.0695652174%
Test score for year 2012 : 93.9659574468%
Test score for year 2012 : 94.4833333333%
Test score for year 2012 : 94.4734693878%
Test score for year 2012 : 94.84%
Test score for year 2012 : 94.8235294118%
Test score for year 2012 : 95.0307692308%
Test score for year 2013 : 95.0641509434%
Test score for year 2013 : 95.0148148148%
Test score for year 2013 : 95.1927272727%
Test score for year 2013 : 95.457142

Test score for year 2016 : 62.5485714286%
Test score for year 2017 : 62.8732394366%
Test score for year 2017 : 63.3555555556%
Test score for year 2017 : 63.6328767123%
Test score for year 2017 : 64.2486486486%
Test score for year 2017 : 64.4906666667%
Test score for year 2017 : 65.5631578947%
Test score for year 2017 : 66.0675324675%
Test score for year 2017 : 66.3538461538%
Test score for year 2017 : 66.6734177215%
Test score for year 2017 : 67.285%
Naive Bayes : ********
Test score for year 2011 : 35.2363636364%
Test score for year 2011 : 35.7666666667%
Test score for year 2011 : 33.2923076923%
Test score for year 2011 : 31.5714285714%
Test score for year 2011 : 30.8533333333%
Test score for year 2011 : 30.375%
Test score for year 2011 : 30.2352941176%
Test score for year 2011 : 29.3555555556%
Test score for year 2011 : 29.5578947368%
Test score for year 2011 : 28.92%
Test score for year 2012 : 33.6761904762%
Test score for year 2012 : 33.4363636364%
Test score for year 2012 : 32.208