# Loading Modules

In [1]:
from scipy import interp
import seaborn as sns
import numpy as np
import pandas as pd
import csv
import datetime
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, classification_report, recall_score, precision_score, precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import train_test_split, cross_val_score, KFold, StratifiedKFold
from patsy import dmatrices
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import statsmodels.api as sm
from sklearn.svm import LinearSVC

In [2]:
def prob_adjusted_outcome(model_probabilities,threshold):
    y_adjusted=[]
    for prob in model_probabilities:
        if prob[1] > threshold:
            y_adjusted.append(True)
        else:
            y_adjusted.append(False)
    return y_adjusted

# Loading Dataset and Reordering Data

In [3]:
filename='ATA_data.csv'

df = pd.read_csv(filename)
df=df.drop('Unnamed: 0',1)

#### Taking interesting features

In [16]:
cols_to_keep=['installed', 'clicked', 'impression', 'app_type', 'weekday', 'hours', 'device', 'country', 'publisher_name', 'app_name']
filtered_df=df[cols_to_keep]

### Downsampling

In [17]:
ratio=np.mean(df['installed'])

In [18]:
#Downsampling by target downsample rate
TotalData=130000
Fsamples=int(np.round(TotalData*(1-ratio)))

In [19]:
False_df=filtered_df[filtered_df['installed'] == False]
Truth_df=filtered_df[filtered_df['installed'] == True]

df2_false=resample(False_df, n_samples=Fsamples, random_state=0, replace = False)
df2_truth=resample(Truth_df, n_samples=TotalData-Fsamples, random_state=0, replace = False)
# Merge the downsampled False with total True outcomes
less_imbalance=pd.concat([df2_false,df2_truth])

In [20]:
less_imbalance.shape

(130000, 10)

In [28]:
y_com,X_com = dmatrices('installed ~ C(hours) + C(device) + C(weekday) + C(app_type) + C(country) +C(publisher_name)'
               ,less_imbalance, return_type='dataframe'
               )

In [29]:
y_com=np.ravel(y_com.iloc[:,1:])

In [30]:
    x_baseline = range(0,110,10)
    y_baseline = x_baseline

In [31]:
rstates=[0,1,2,11,4,5,6,7,8,9,10]
ymodel=np.empty((11,11))
lifts=np.empty((11,11))

for r,index in enumerate(rstates):
    
    X_train, X_test, y_train, y_test = train_test_split(X_com,y_com, test_size=0.2, random_state=rstates[r])

    modelLogR2 = LogisticRegression(class_weight='auto',fit_intercept=False);
    modelLogR2.fit(X_train, y_train);
    probLR2 = modelLogR2.predict_proba(X_test)
    pranks = range(0,100,10)
    pranks = pranks[::-1]
    Test_df = pd.DataFrame(probLR2[:,1], columns=['Prob'])
    Test_df['Outcome'] = y_test
    num_of_customers = np.empty((10,1))
    num_response = np.empty((10,1))
    response_rate = np.empty((10,1))
    total_pos_outcomes = Test_df['Outcome'].sum()

    for i,percent in enumerate(pranks):
        rank_percent = np.percentile(probLR2[:,1],percent)
        Prob_df = Test_df[Test_df['Prob'] > rank_percent]
        num_of_customers[i,:] = Prob_df.shape[0]
        num_response[i,:] = Prob_df['Outcome'].sum()
        response_rate[i,:] = num_response[i,:]/(total_pos_outcomes)*100


    Gain_df = pd.DataFrame(num_of_customers, columns=['No. of Impressions'])
    Gain_df['No. of Installs'] = num_response
    Gain_df['Response Rate %'] = np.round(response_rate,0)



    y_model=[0]+(list(Gain_df['Response Rate %']))

    lift = np.round(list(np.array(y_model)/np.array(x_baseline)),1)

    lift[0] = 1
    ymodel[r,:] = y_model
    lifts[r,:] = lift

In [32]:
y_predict=modelLogR2.predict(X_test)

In [33]:
print confusion_matrix(y_test, y_predict, labels=[True, False]).transpose()

[[    7  7132]
 [    3 18858]]


In [36]:
#for i,r in enumerate(rstates):
#   plt.plot( x_baseline, ymodel[i,:], '-')
plt.plot(x_baseline, y_baseline, 'rs--')  
plt.plot(x_baseline,pd.DataFrame(ymodel).mean(), '--', color=(0, 0, 0), label='Mean')
plt.xlim([0, 100.5])
plt.xlabel('% of Impressions Sampled', fontsize = 18)
plt.ylabel('% of Installs Found', fontsize = 18)
plt.title('Gains Chart', fontsize = 18)
plt.xticks(x_baseline,x_baseline, fontsize = 14)
plt.yticks(y_baseline,y_baseline, fontsize = 14)
plt.savefig('Gains_Charts.png', dpi=200)
plt.legend(['Baseline', 'Mean'], fontsize = 14, loc =0)
plt.show()

In [46]:
plt.plot(x_baseline,pd.DataFrame(ymodel).mean(), '--', color=(0.6, 0.6, 0.6), label='Mean')

[<matplotlib.lines.Line2D at 0x1080c2e50>]

In [47]:
pd.DataFrame(ymodel).mean()

0       0.000000
1      14.636364
2      31.090909
3      55.272727
4      71.090909
5      81.000000
6      87.909091
7      88.545455
8      93.272727
9      95.727273
10    100.000000
dtype: float64

In [40]:
#for i,r in enumerate(rstates):
#    plt.plot(x_baseline, lifts[i,:], '-')
plt.plot(x_baseline,pd.DataFrame(lifts).mean(), '--', color=(0, 0, 0), label='Mean')
plt.plot(x_baseline, np.ones(11), 'rs--')
plt.xlim([10, 100.5])
plt.ylim([0.5,3.0])
plt.xlabel('% of Impressions Sampled', fontsize = 18)
plt.ylabel('Lift', fontsize = 18)
plt.title('Lift Chart', fontsize = 18)
plt.xticks(x_baseline,x_baseline, fontsize = 14)
plt.yticks([0.5, 1, 1.5, 2, 2.5, 3],[0.5, 1, 1.5, 2, 2.5, 3], fontsize = 14)
plt.legend(['Mean', 'baseline'], fontsize = 14)
plt.show()
plt.savefig('Lift_Charts.png', dpi=200)

In [None]:
len(lift)

In [None]:
Prob_df.shape[0]

In [None]:
Test_df.shape

In [None]:
np.min(Prob_df['Prob'])

In [None]:
rm=[LogisticRegression(fit_intercept=False), RandomForestClassifier(), GradientBoostingClassifier()]
cvs=5
model_aucs=np.empty((len(rm),cvs))
model_recalls=np.empty((len(rm),cvs))
model_precision=np.empty((len(rm),cvs))

for i,models in enumerate(rm):
    
    model_recalls[i,:] = cross_val_score(models, X_com, y_com, scoring = 'recall', cv = cvs)
    model_precision[i,:] = cross_val_score(models, X_com, y_com, scoring = 'precision', cv = cvs)
    model_aucs[i,:] = cross_val_score(models, X_com, y_com, scoring = 'roc_auc', cv = cvs)

In [None]:
summary_matrices = np.empty((6,len(rm)))
summary_matrices[0, :] = np.mean(model_recalls, axis=1)
summary_matrices[1, :] = np.std(model_recalls, axis = 1)
summary_matrices[2, :] = np.mean(model_precision, axis=1)
summary_matrices[3, :] = np.std(model_precision, axis = 1)
summary_matrices[4, :] = np.mean(model_aucs, axis=1)
summary_matrices[5, :] = np.std(model_aucs, axis = 1)
print summary_matrices

In [None]:
summary_df=pd.DataFrame(summary_matrices)

In [None]:
summary_df

### Random Forests

In [41]:
modelRandomForest= RandomForestClassifier(class_weight='auto')
modelRandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [42]:
importances = modelRandomForest.feature_importances_
std = np.std([tree.feature_importances_ for tree in modelRandomForest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

In [43]:
indices.shape

(287,)

In [80]:
plt.figure()
plt.title("Feature importances", fontsize = 18)
plt.bar(range(len( importances[indices[0:10]])), importances[indices[0:10]],
       color="r", align="center")
plt.xticks(range(len( importances[indices[0:10]])), index_importance, rotation = 60, fontsize = 16)
plt.yticks(fontsize = 16)
plt.xlim([-1, 10.5])
plt.show()

In [73]:
index_importance = [u'Publisher A', u'Phone',
       u'USA', u'Sunday', u'Friday',
       u'Publisher B', u'Saturday',
       u'1 p.m.', u'Publisher C',
       u'Publisher D']

In [61]:
X_train.columns[13]

'C(hours)[T.13]'

In [72]:
less_imbalance['hours'].unique()

array([15, 19,  2,  8, 22, 12, 21, 17, 16, 10,  1, 23,  5,  4, 20, 13, 18,
       14,  0,  3,  7,  9, 11,  6])

In [70]:
X_train.columns[indices[0:10]]

Index([u'C(publisher_name)[T.Tapps]', u'C(device)[T.Phone]',
       u'C(country)[T.USA]', u'C(weekday)[T.7]', u'C(weekday)[T.5]',
       u'C(publisher_name)[T.Ensolight Interactive]', u'C(weekday)[T.6]',
       u'C(hours)[T.13]', u'C(publisher_name)[T.Scopely, Inc.]',
       u'C(publisher_name)[T.SEGA]'],
      dtype='object')

In [None]:
# Random Forest Predictions
predictedRF = modelRandomForest.predict(X_test)
probsRF = modelRandomForest.predict_proba(X_test)

In [None]:
probsRF

In [None]:
# Look at precision/recall and confusion matrix

print confusion_matrix(y_test,predictedRF, labels=[True, False]).transpose()
print classification_report(y_test,predictedRF)

In [None]:
# Look at precision/recall and confusion matrix

print confusion_matrix(y_train,predictedRF, labels=[True, False]).transpose()
print classification_report(y_train,predictedRF)

In [None]:
probs.sum()/y_test.sum()

In [None]:
def prob_class_outcome(model_probabilities,select_class):
    prob_class=[]
    for prob in model_probabilities:
            prob_class.append(prob[select_class])
    return prob_class

In [None]:
probRF1=prob_class_outcome(probsRF,1)

In [None]:
plt.hist(probRF1)

In [None]:
np.where(probRF1 == 0.0)

In [None]:
probRF1.shape()