In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from numpy import *
from sklearn.model_selection import train_test_split
from numpy import *


plt.style.use('ggplot')


In this cell, I am loading the training dataset which is already labelled and I am ignoring the unnamed row which doesn't count as a feature. 

In [96]:
df = pd.read_csv("df_training_scholarjet.csv")
df = df.loc[:, ~df.columns.str.contains('Unnamed')]
print (df.head())

   cuid  convert_30  revenue_30     roll_up currentstatus companytypegroup  \
0    20           0         0.0  Onboarding        Active         Business   
1    57           0         0.0  Onboarding        Active         Business   
2   163           0         0.0  Onboarding        Active         Business   
3   167           0         0.0  Onboarding      Enrolled         Business   
4   168           0         0.0  Onboarding      Enrolled         Business   

  team            customersource accrole num_employees  ...  \
0   US      External Application    None        50plus  ...   
1   US      Internal Application    None          2to5  ...   
2   US      Internal Application    None             1  ...   
3   US  Internal Customer Scrape    None        50plus  ...   
4   US                   Gateway    None         6to10  ...   

  percemailopenedyearsixty percemailclickedone percemailclickedthreeone  \
0                 0.000000                 0.0                      0.0   
1 

In this prediction model, "Convert_30" and "revenue_30" are the labels to be predicted.

In [68]:
labels1 = np.asarray(df.convert_30)
labels2 = np.asarray(df.revenue_30)

In this cell, I am encoding the labels using the label encoder.

In [69]:
le = LabelEncoder()
le.fit(labels1)
labels1 = le.transform(labels1)

le2 = LabelEncoder()
le2.fit(labels2)
labels2 = le2.transform(labels2)

In the below cell, I am dropping the following columns because they are the labels to be predicted and they doesn't count as features.

In [70]:
df_selected = df.drop(['revenue_30', 'cuid', 'convert_30'], axis=1)

In this cell, I am extracting features from the training dataset and fitting the features into an array using the "DictVectorizer"

In [72]:
df_features = df_selected.to_dict(orient='records')
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
features = vec.fit_transform(df_features).toarray()
where_are_NaNs = isnan(features)
features[where_are_NaNs] = 0
print(features)

[[  1.     0.     0.   ...   0.     0.    59.99]
 [  1.     0.     0.   ...   0.    78.   126.48]
 [  1.     0.     0.   ...   0.     0.   237.98]
 ...
 [  0.     0.     0.   ...   0.   119.     0.  ]
 [  1.     0.     0.   ...   0.     0.     0.  ]
 [  1.     0.     0.   ...   0.     0.   409.99]]


In this particular cell, I am splitting the training and testing dataset into 90% and 10% composition respectively to find the accuracy.

In [73]:
features_train1, features_test1, labels_train1, labels_test1 = train_test_split(
    features, labels1, 
    test_size=0.10, random_state=42)

features_train2, features_test2, labels_train2, labels_test2 = train_test_split(
    features, labels2, 
    test_size=0.10, random_state=42)



Training the model using Random Forest Classifier.



In [75]:
from sklearn.ensemble import RandomForestClassifier
clf1 = RandomForestClassifier()
clf1.fit(features_train1, labels_train1)

acc_test = clf1.score(features_test1, labels_test1)
print ("Test Accuracy:", acc_test)



Test Accuracy: 0.8944187699964451


Training the model using SVM algorithm.

In [76]:
from sklearn import svm
clf2 = svm.SVC(gamma='scale', C=1.0, kernel='rbf')
clf2.fit(features_train1, labels_train1)

acc_test = clf2.score(features_test1, labels_test1)
print ("Test Accuracy:", acc_test)

Test Accuracy: 0.8944187699964451


Loading the testing dataset and eliminating the unnamed columns.

In [81]:
df2 = pd.read_csv("df_holdout_scholarjet.csv")
df2 = df2.loc[:, ~df2.columns.str.contains('Unnamed')]
print (df2.head())

cuid = df2['cuid'].values
print(cuid)

     cuid     roll_up currentstatus companytypegroup team  \
0   16838  Onboarding      Enrolled         Business   US   
1  532175  Onboarding      Enrolled         Business   US   
2  532176  Onboarding      Enrolled         Business   US   
3  532187  Onboarding      Enrolled         Business   US   
4   16938  Onboarding      Enrolled            Trade   US   

             customersource  accrole num_employees num_purchases_year  \
0      Internal Application     None             1               1to2   
1             Search - Paid     None         6to10               None   
2      Internal Application     None        11to50               3to5   
3      Internal Application  Primary          None               None   
4  Internal Customer Scrape  Primary          None               None   

  cost_purchases_year  ... percemailopenedyearsixty  percemailclickedone  \
0           lessthan1  ...                 0.025806                  0.0   
1                None  ...                

Similarly creating the features for the testing dataset and predicting the "Convert_30" label for the testing dataset.

In [82]:
df_selected = df2.drop(['cuid'], axis=1)
df_features = df_selected.to_dict(orient='records')
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
features = vec.fit_transform(df_features).toarray()
where_are_NaNs = isnan(features)
features[where_are_NaNs] = 0


output = clf1.predict_proba(features)
pred = output[:,1]
print(pred)
    

unique, counts = np.unique(pred, return_counts=True)
dict(zip(unique, counts))

[0.  0.5 0.3 ... 0.  0.  0. ]


{0.0: 15918,
 0.1: 6870,
 0.2: 3392,
 0.3: 1854,
 0.4: 1120,
 0.5: 570,
 0.6: 388,
 0.7: 165,
 0.8: 72,
 0.9: 23,
 1.0: 3}

Creating a revenue prediction model by using Decision tree classifier with a depth of 10

In [93]:
from sklearn import tree
import graphviz

t = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10)
t = t.fit(features_train2, labels_train2)
accuracy = t.score(features_test2, labels_test2)
print(accuracy)

0.8563810878066122


Prediction of "revenue_30" label for the testing dataset.

In [94]:
pred2 = t.predict(features)


unique, counts = np.unique(pred2, return_counts=True)
dict(zip(unique, counts))

{0: 29000,
 1: 5,
 2: 2,
 4: 1,
 5: 7,
 7: 10,
 8: 1,
 9: 3,
 16: 2,
 17: 4,
 23: 5,
 24: 7,
 25: 10,
 27: 4,
 30: 5,
 32: 7,
 33: 1,
 37: 3,
 38: 1,
 39: 3,
 44: 4,
 45: 4,
 46: 4,
 52: 7,
 53: 14,
 57: 1,
 62: 1,
 64: 10,
 66: 4,
 67: 5,
 68: 2,
 69: 8,
 72: 5,
 76: 5,
 78: 4,
 81: 4,
 85: 1,
 93: 3,
 96: 6,
 103: 5,
 107: 3,
 109: 9,
 113: 8,
 115: 2,
 123: 3,
 125: 1,
 129: 6,
 130: 3,
 133: 4,
 135: 1,
 138: 3,
 139: 15,
 141: 3,
 144: 4,
 146: 6,
 148: 5,
 149: 1,
 152: 4,
 153: 1,
 160: 4,
 161: 6,
 167: 5,
 174: 9,
 175: 3,
 180: 3,
 182: 7,
 183: 4,
 185: 1,
 193: 3,
 195: 7,
 200: 6,
 202: 6,
 203: 3,
 204: 5,
 210: 2,
 213: 2,
 215: 4,
 217: 5,
 221: 10,
 227: 7,
 229: 4,
 232: 1,
 233: 1,
 236: 2,
 237: 3,
 240: 7,
 241: 2,
 244: 3,
 246: 3,
 248: 2,
 250: 4,
 252: 2,
 254: 1,
 255: 5,
 256: 5,
 258: 14,
 259: 1,
 260: 1,
 261: 15,
 262: 3,
 265: 6,
 266: 5,
 268: 10,
 271: 9,
 275: 4,
 290: 4,
 301: 1,
 302: 2,
 303: 1,
 312: 4,
 316: 5,
 323: 9,
 325: 10,
 326: 2,
 332: 1

Writing all the predictions in a csv file

In [95]:
import csv

with open('predictions.csv', mode='w') as f:
    mywriter = csv.writer(f, delimiter=',')
    myrow = ['cuid']
    row = ['convert_30']
    myrow.extend(row)
    row = ['revenue_30']
    myrow.extend(row)
    mywriter.writerow(myrow)
    
    for i in range(len(cuid)):
        myrow = [cuid[i]]
        row = [pred[i]]
        myrow.extend(row)
        row = [pred2[i]]
        myrow.extend(row)
        mywriter.writerow(myrow)
