# Module Imports

In [233]:
import csv
import pickle

import numpy
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

# Import data

In [234]:
def importData(filename):
    """Read the labelled dataset from a comma-separated text file.

    The first line is treated as a header and skipped.  Every following
    non-blank line must have at least four comma-separated fields: the
    label, two feature strings, and a fourth field of the form ``A|B``
    that is split into two further features.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    X : list of list of str
        One ``[field1, field2, A, B]`` row per data line, in file order.
    Y : list of str
        The first field (label) of each data line, in file order.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than four comma-separated
        fields, or its fourth field contains no ``|``.
    """
    X = []
    Y = []
    with open(filename, 'r') as data_file:
        # Skip the header row; the default keeps an empty file from raising.
        next(data_file, None)
        for line in data_file:
            # Drop the line terminator up front (also covers '\r\n' when the
            # file is read without newline translation).
            line = line.rstrip("\r\n")
            # Blank lines (typically a trailing one at end of file) carry no
            # record; indexing into them would raise IndexError.
            if not line.strip():
                continue
            line_list = line.split(",")
            # Split the compound fourth column once instead of twice.
            compound = line_list[3].split("|")
            Y.append(line_list[0])
            X.append([line_list[1], line_list[2],
                      compound[0].strip("\n"), compound[1].strip("\n")])
    return X, Y

In [235]:
# Load feature rows (X) and labels (Y); the path is relative to the
# current working directory.
X,Y=importData('dataset_2.csv')

In [236]:
# Preview the first ten parsed feature rows.
X[:10]

[['>10', '<4.4', 'HR', 'AIRTEL'],
 ['>5', '<5', 'UP', 'JIO'],
 ['>10', '<5', 'AP', 'AIRTEL'],
 ['>5', '<5', 'RJ', 'JIO'],
 ['>10', '<5.6', 'WB', 'VODAFONE'],
 ['>10', '<5.6', 'HR', 'JIO'],
 ['>5', '<5', 'TN', 'JIO'],
 ['>20', '<6', 'DL', 'AIRTEL'],
 ['>5', '<5', 'MH', 'JIO'],
 ['>10', '<5.6', 'UK', 'AIRTEL']]

# Inspect X1 and X2 Features

In [237]:
# Collect the distinct numeric values seen in the first two columns.
# Each raw value carries a one-character prefix (e.g. '>10', '<4.4'),
# which is dropped before converting to float.
x1 = {float(row[0][1:]) for row in X}
x2 = {float(row[1][1:]) for row in X}

In [238]:
# Distinct numeric values found in the first feature column.
x1

{5.0, 10.0, 20.0, 30.0, 40.0}

In [239]:
# Distinct numeric values found in the second feature column.
x2

{4.0, 4.2, 4.4, 4.6, 4.8, 5.0, 5.2, 5.4, 5.6, 5.8, 6.0}

# One Hot Encoding

In [240]:
print('Creating Vectorizer for whole data set')
vectorizer = DictVectorizer(sparse=False)

# Turn every row into a {'X1': ..., 'X2': ..., ...} mapping; keys are
# numbered from 1 in column order.
vectorizer_input = [
    {'X' + str(position): value for position, value in enumerate(row, start=1)}
    for row in X
]
print("Dictonary complete")

# fit() learns the feature vocabulary from every row (train and test alike).
output_all = vectorizer.fit(vectorizer_input)


Creating Vectorizer for whole data set
Dictonary complete


In [241]:
# Spot-check the dictionary built for a single row.
vectorizer_input[3000]

{'X1': '>10', 'X2': '<5.6', 'X3': 'MH', 'X4': 'JIO'}

In [242]:
# Total number of rows available for training and testing.
len(vectorizer_input)

3209

# Train test split

In [243]:
# Rows before TRAIN_SIZE are used for training, the remainder for testing.
# The split is positional (no shuffling), so it is identical on every run.
TRAIN_SIZE = 2500

# Features and labels are sliced independently, so they must line up 1:1.
assert len(vectorizer_input) == len(Y), "features and labels differ in length"

train_X = vectorizer_input[:TRAIN_SIZE]
train_Y = Y[:TRAIN_SIZE]
test_X = vectorizer_input[TRAIN_SIZE:]
test_Y = Y[TRAIN_SIZE:]

# Serialize the one hot encoder

In [244]:
# Write the fitted vectorizer to 'dictvectorizer.pkl' (binary mode).
# NOTE: pickle files must only ever be loaded from a trusted source.
with open('dictvectorizer.pkl', 'wb') as fid:
    pickle.dump(vectorizer, fid)

In [245]:
# Encode a single row to check the width of the one-hot feature vector.
vectorizer.transform(vectorizer_input[0]).shape

(1, 55)

# Train a simple perceptron model

In [246]:
# Perceptron model.
# `n_iter` was deprecated in scikit-learn 0.19 and later removed; `max_iter`
# is its replacement.  `tol=None` disables early stopping so that exactly
# `max_iter` passes over the training data are made.
# Alternative hyperparameters kept for reference:
# clf = Perceptron(max_iter=30, tol=None, verbose=1, eta0=0.2, penalty='l2', alpha=0.001)
clf = Perceptron(max_iter=11, tol=None, verbose=1)
clf.fit(vectorizer.transform(train_X),train_Y)

-- Epoch 1
Norm: 8.37, NNZs: 34, Bias: -3.000000, T: 2500, Avg. loss: 0.817200
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 11.66, NNZs: 34, Bias: -3.000000, T: 5000, Avg. loss: 0.806800
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 10.68, NNZs: 43, Bias: -2.000000, T: 7500, Avg. loss: 0.796400
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 11.14, NNZs: 46, Bias: -5.000000, T: 10000, Avg. loss: 0.792800
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 9.70, NNZs: 41, Bias: -2.000000, T: 12500, Avg. loss: 0.827200
Total training time: 0.00 seconds.
-- Epoch 6
Norm: 12.57, NNZs: 42, Bias: -4.000000, T: 15000, Avg. loss: 0.784800
Total training time: 0.00 seconds.
-- Epoch 7
Norm: 10.86, NNZs: 42, Bias: -5.000000, T: 17500, Avg. loss: 0.787200
Total training time: 0.00 seconds.
-- Epoch 8
Norm: 12.41, NNZs: 37, Bias: -3.000000, T: 20000, Avg. loss: 0.780000
Total training time: 0.00 seconds.
-- Epoch 9
Norm: 11.49, NNZs: 43, Bias: -3.000000, T: 22500, Avg. loss: 0.816



Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=11, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=1, warm_start=False)

In [247]:
# Encode the held-out rows, then collect the predicted labels as a plain list.
test_features = vectorizer.transform(test_X)
predictions = clf.predict(test_features).tolist()

In [248]:
# Number of predictions produced (one per test row).
len(predictions)

709

In [249]:
# Fraction of held-out rows whose predicted label equals the true label.
# (accuracy_score is imported in the notebook's top import cell.)
accuracy_score(test_Y, predictions)

0.89562764456981669

# Manual Testing to verify predictions

In [133]:
# First held-out row, shown before it is encoded.
test_X[0]

{'X1': '>10', 'X2': '<5.2', 'X3': 'RJ', 'X4': 'VODAFONE'}

In [134]:
# Predict the label for that single row with the current classifier.
clf.predict(vectorizer.transform(test_X[0]))

array(['FALSE'],
      dtype='|S5')

# Train an SVM model

In [135]:
# Support vector classifier with the default RBF kernel.
# gamma is pinned to 'auto': it was the library default when this notebook
# was run, but later scikit-learn releases default to 'scale', which would
# silently change the fitted model.
clf = svm.SVC(gamma='auto', verbose=True)
clf.fit(vectorizer.transform(train_X),train_Y)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [136]:
# Score the current classifier on the held-out rows.
test_features = vectorizer.transform(test_X)
predictions = clf.predict(test_features).tolist()
accuracy_score(test_Y, predictions)

0.89421720733427368

# Random Forest Classifier

In [105]:
# Random forest with entropy-based splits.  A fixed random_state makes the
# fitted forest, and therefore the reported accuracy, repeatable between runs.
# (RandomForestClassifier is imported in the notebook's top import cell.)
clf = RandomForestClassifier(n_estimators=55, criterion='entropy', random_state=0)
clf.fit(vectorizer.transform(train_X),train_Y)
predictions=clf.predict(vectorizer.transform(test_X)).tolist()
accuracy_score(test_Y,predictions)

0.87588152327221436