In [1]:
#%reset

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict,cross_val_score,train_test_split
from sklearn.metrics import (classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
                             , precision_recall_fscore_support, accuracy_score, roc_auc_score) 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix(
        (loader['data'], loader['indices'], loader['indptr']),
        shape=loader['shape']
    )

def PlotConfusionMatrix(y_test,pred,y_test_dismiss,y_test_click):
   
    cfn_matrix = confusion_matrix(y_test,pred)
    cfn_norm_matrix = np.array([[1.0 / y_test_dismiss,1.0/y_test_dismiss],[1.0/y_test_click,1.0/y_test_click]])
    norm_cfn_matrix = cfn_matrix * cfn_norm_matrix

    fig = plt.figure(figsize=(9,3))
    ax = fig.add_subplot(1,2,1)
    sns.heatmap(cfn_matrix,cmap='coolwarm_r',linewidths=0.5,annot=True,ax=ax)
    plt.title('Confusion Matrix')
    plt.ylabel('Real Classes')
    plt.xlabel('Predicted Classes')

    ax = fig.add_subplot(1,2,2)
    sns.heatmap(norm_cfn_matrix,cmap='coolwarm_r',linewidths=0.5,annot=True,ax=ax)

    plt.title('Normalized Confusion Matrix')
    plt.ylabel('Real Classes')
    plt.xlabel('Predicted Classes')
    plt.show()
    
    print('---Classification Report---')
    print(classification_report(y_test,pred))
    
def plotCurves(y_test, pred_prob):
    fig = plt.figure(figsize=(9,5))
    ax1 = fig.add_subplot(1,2,1)
    ax1.set_xlim([-0.05,1.05])
    ax1.set_ylim([-0.05,1.05])
    ax1.set_xlabel('Recall')
    ax1.set_ylabel('Precision')
    ax1.set_title('PR Curve')

    ax2 = fig.add_subplot(1,2,2)
    ax2.set_xlim([-0.05,1.05])
    ax2.set_ylim([-0.05,1.05])
    ax2.set_xlabel('False Positive Rate')
    ax2.set_ylabel('True Positive Rate')
    ax2.set_title('ROC Curve')

    #for w,k in zip([1, 13, 20 ],'bgr'): # 'bgrcmykw'):
    p,r,thresholds1 = precision_recall_curve(y_test,pred_prob)
    fpr, tpr, thresholds2 = roc_curve(y_test,pred_prob)
    ax1.plot(r,p)
    ax2.plot(fpr,tpr) 

    plt.show()
    

def evaluateClassifier(clf, X_train, y_train, X_test, y_test, threshold=None, plotROC=False):     

    y_test_dismiss = y_test.value_counts()[0]
    y_test_click = y_test.value_counts()[1]
    
    clf.fit(X_train,y_train)
    pred_proba = clf.predict_proba(X_test)[:,1]

    if threshold is None:
        pred = clf.predict(X_test)
    else:
        pred_mine = np.where(pred_proba > threshold, 1, 0)
        pred = pred_mine
        
    accuracy = accuracy_score(y_test, pred)
    report = precision_recall_fscore_support(y_test, pred, average=None, labels=[0, 1])
    # We should pass the probability estimates of the positive class as the second parameter to roc curve
    rocScore = roc_auc_score(y_test, pred_proba)
        
    precision, recall, thresholds = precision_recall_curve(y_test, pred_proba)
    aucScore = auc(recall, precision)

    #PlotConfusionMatrix(y_test,pred,y_test_dismiss,y_test_click) 
    #input()         
        
    if(plotROC):
        plotCurves(y_test, pred_proba)
    
    accuracy = float("{:.2f}".format(accuracy))
    rocScore = float("{:.2f}".format(rocScore))
    aucScore = float("{:.2f}".format(aucScore))
    precisions = report[0] 
    recalls = report[1] 
    fscores = report[2]
    supports = report[3]
        
    return (accuracy, rocScore, aucScore, np.around(precisions, 3), 
            np.around(recalls, 2), np.around(fscores, 2), np.around(supports, 2))     


In [4]:
def selectImportantApps(appList, df, numMostCommonApps = 30, numAppsToSelect = 6):
    from sklearn.feature_selection import SelectKBest, mutual_info_classif, SelectPercentile, f_classif, chi2
    
    notifApplistMatrix = appList[df['mask'],:numMostCommonApps] 
    X = pd.DataFrame(df['mask'])
    for i in range(notifApplistMatrix.shape[1]):
        colName = 'app' + str(i)
        appArray = np.squeeze(np.asarray(notifApplistMatrix[:, i].todense()))
        X[colName] = appArray
    X.drop('mask', axis=1, inplace=True)
    y = df['interaction'] 
    
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(X, y)
    scores = selector.scores_
    
    selectedAppNumbers = scores.argsort()[-numAppsToSelect:][::-1]
    return selectedAppNumbers

In [5]:
def addAppsToDF(df, appList, selectedAppNumbers):
    notifApplistMatrix = appList[df['mask'], :]
    notifApplistMatrix = notifApplistMatrix[:, selectedAppNumbers]
    for i in range(len(selectedAppNumbers)):
        colName = 'app' + str(selectedAppNumbers[i])
        appArray = np.squeeze(np.asarray(notifApplistMatrix[:, i].todense()))
        df[colName] = appArray

In [6]:
def getNotifsIdsTrainTestSplit():
    FILE = "notifsIds_split_train_test.npy"
    notifsIds = np.load(FILE)
    trainNotifsIds= notifsIds[0]
    testNotifsIds= notifsIds[1]
    return trainNotifsIds, testNotifsIds

In [7]:
#notifId = 1386267
df0 = pd.read_csv('./df0.csv')
df1= pd.read_csv('./df1.csv')
df2= pd.read_csv('./df2.csv')
df3= pd.read_csv('./df3.csv')
df4= pd.read_csv('./df4.csv')
df = pd.concat([df0, df1, df2, df3, df4])
#df = df[df['wrapper_id'] == notifId]  
df.drop(['Unnamed: 0', 'weekday', 'time', 'mainState', 'phoneModel'], axis=1, inplace=True)

In [8]:
df.columns

Index(['mask', 'wrapper_id', 'interaction', 'numApps'], dtype='object')

In [9]:
appList = load_sparse_csr("../data/app_list_csr.npz")

In [10]:
#condition = appList.sum(axis=0) > 400000
#condition

__ Few investigation shows that AppList is sorted (decreasing order) based on the number of users.
==> So, we just consider first 30 applications __

#selectedAppNumbers = [11, 18, 16, 20, 6, 3]
selectedAppNumbers = selectImportantApps(appList=appList, df = df, numMostCommonApps = 30, numAppsToSelect = 6)
addAppsToDF(df=df, appList=appList, selectedAppNumbers=selectedAppNumbers)

In [11]:
svd = TruncatedSVD(n_components=18, random_state=1)
svdColumns = svd.fit_transform(appList)
notifSVD = svdColumns[df['mask'], :]
for i in range(notifSVD.shape[1]):
    colName = 'svd' + str(i)
    df[colName] = notifSVD[:, i]
    #df[colName].fillna(0,inplace=True)

## LogisticRegression classifier


In [12]:
trainNotifsIds, testNotifsIds = getNotifsIdsTrainTestSplit()
trainMask = np.in1d(df['wrapper_id'], trainNotifsIds)
testMask = np.in1d(df['wrapper_id'], testNotifsIds)
X = df.drop('numApps', axis=1)
y = df['interaction']
X_train, y_train = X[trainMask], y[trainMask]
X_test, y_test = X[testMask], y[testMask] 

In [13]:
mask_numClicks_series = X_train.groupby('mask')['interaction'].sum() 
mask_numNotifs_series = X_train.groupby('mask')['interaction'].count() 
mask_clickRatio_series = (1.0 * mask_numClicks_series) / mask_numNotifs_series

X_train['clickRatio'] = 0
X_train['clickRatio'] = X_train['mask'].apply(lambda mask: mask_clickRatio_series.loc[mask])
X_test['clickRatio'] = 0
X_test['clickRatio'] = X_test['mask'].apply(lambda mask: mask_clickRatio_series.loc[mask] 
                                            if mask in mask_clickRatio_series.index else 0)
X_train.drop(['mask', 'wrapper_id', 'interaction'], inplace=True, axis=1)
X_test.drop(['mask', 'wrapper_id', 'interaction'], inplace=True, axis=1)
del df, X, y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus

#### Build model just by features of appList, without considering clickRatio

In [19]:
for weight in [13, 14, 15]:
    print("**************** weight = %f ****************" %weight)
    clf = LogisticRegression(class_weight={0:1, 1:weight}, n_jobs = 4)
    result = (accuracy, rocScore, aucScore, precisions, recalls, fscores, supports) = evaluateClassifier(clf, 
                X_train.drop('clickRatio', axis=1), y_train, X_test.drop('clickRatio', axis=1), y_test) 
    print(result[:5]) 

**************** weight = 13.000000 ****************
(0.57, 0.63, 0.1, array([ 0.956,  0.088]), array([ 0.57,  0.61]))
**************** weight = 14.000000 ****************
(0.51, 0.63, 0.1, array([ 0.959,  0.084]), array([ 0.5 ,  0.68]))
**************** weight = 15.000000 ****************
(0.46, 0.63, 0.1, array([ 0.961,  0.081]), array([ 0.45,  0.73]))


#### Build model just by considering clickRatio

In [15]:
for weight in [50, 60, 70, 80, 90, 100]:
    print("**************** weight = %f ****************" %weight)
    clf = LogisticRegression(class_weight={0:1, 1:weight}, n_jobs = 4)
    result = (accuracy, rocScore, aucScore, precisions, recalls, fscores, supports) = evaluateClassifier(clf, 
                    pd.DataFrame(X_train['clickRatio']), y_train, pd.DataFrame(X_test['clickRatio']), y_test) 
    print(result[:5]) 

**************** weight = 50.000000 ****************
(0.64, 0.67, 0.24, array([ 0.96 ,  0.101]), array([ 0.64,  0.6 ]))
**************** weight = 60.000000 ****************
(0.59, 0.67, 0.24, array([ 0.959,  0.092]), array([ 0.58,  0.63]))
**************** weight = 70.000000 ****************
(0.56, 0.67, 0.24, array([ 0.958,  0.088]), array([ 0.55,  0.64]))
**************** weight = 80.000000 ****************
(0.53, 0.67, 0.24, array([ 0.957,  0.084]), array([ 0.52,  0.65]))
**************** weight = 90.000000 ****************
(0.53, 0.67, 0.24, array([ 0.957,  0.084]), array([ 0.52,  0.65]))
**************** weight = 100.000000 ****************
(0.53, 0.67, 0.24, array([ 0.957,  0.084]), array([ 0.52,  0.65]))


#### Build model by considering both appList features and clickRatio

In [16]:
for weight in [50, 60, 70, 80, 90, 100]:
    print("**************** weight = %f ****************" %weight)
    clf = LogisticRegression(class_weight={0:1, 1:weight}, n_jobs = 4)
    result = (accuracy, rocScore, aucScore, precisions, recalls, fscores, supports) = evaluateClassifier(clf, 
                X_train, y_train, X_test, y_test) 
    print(result[:5]) 

**************** weight = 50.000000 ****************
(0.64, 0.68, 0.22, array([ 0.96 ,  0.102]), array([ 0.64,  0.6 ]))
**************** weight = 60.000000 ****************
(0.59, 0.68, 0.22, array([ 0.959,  0.093]), array([ 0.59,  0.63]))
**************** weight = 70.000000 ****************
(0.55, 0.68, 0.22, array([ 0.958,  0.087]), array([ 0.55,  0.64]))
**************** weight = 80.000000 ****************
(0.53, 0.68, 0.22, array([ 0.957,  0.084]), array([ 0.52,  0.65]))
**************** weight = 90.000000 ****************
(0.53, 0.68, 0.22, array([ 0.957,  0.084]), array([ 0.52,  0.65]))
**************** weight = 100.000000 ****************
(0.52, 0.68, 0.22, array([ 0.957,  0.083]), array([ 0.51,  0.65]))


In [21]:
for weight in [120, 150]:
    print("**************** weight = %f ****************" %weight)
    clf = LogisticRegression(class_weight={0:1, 1:weight}, n_jobs = 4)
    result = (accuracy, rocScore, aucScore, precisions, recalls, fscores, supports) = evaluateClassifier(clf, 
                X_train, y_train, X_test, y_test) 
    print(result[:5]) 

**************** weight = 120.000000 ****************
(0.34, 0.68, 0.22, array([ 0.961,  0.074]), array([ 0.31,  0.81]))
**************** weight = 150.000000 ****************
(0.08, 0.67, 0.22, array([ 0.98 ,  0.064]), array([ 0.02,  1.  ]))


__ Now we check if the improvement is just because of the clickRatio or the features of the appList are effective, too __

__ Here we check the clickRatio of test data to see if the click precision of model is better than this base case (sending notifs to everyone) or not; it is 0.063 __

In [18]:
(y_test == 1).sum() / len(y_test)

0.063154151693838131

## RandomForestClassifier

for max_depth in [3, 5, 8, 11, 20]:
    print("-----------------------------------------------------------------")
    for max_features in [3, 5, 20, X.shape[1]]:
        #print("**************** weight = %d ****************" %weight)
        print("**************** max_depth = %d, max_features=%s****************" %(max_depth, str(max_features)))
        clf = RandomForestClassifier(class_weight={0:1, 1:7}, n_jobs = 4, n_estimators=80, max_depth=max_depth, max_features=max_features) 
        result = (accuracy, rocScore, aucScore, precisions, recalls, fscores, supports) = evaluateClassifier(clf, X, y) 
        print(result[:5])

#X = X.drop('numApps', axis=1)
max_depth = 8
max_features = 5
clf = RandomForestClassifier(class_weight={0:1, 1:7}, n_jobs = 4, n_estimators=100, max_depth=max_depth, max_features=max_features) 
result = (accuracy, rocScore, aucScore, precisions, recalls, fscores, supports) = evaluateClassifier(clf, X, y) 
print(result[:5])
print(clf.feature_importances_)

X.columns

X['numApps'] = df['numApps']
max_depth = 8
max_features = 5
clf = RandomForestClassifier(class_weight={0:1, 1:7.2}, n_jobs = 4, n_estimators=100, max_depth=max_depth, max_features=max_features) 
result = (accuracy, rocScore, aucScore, precisions, recalls, fscores, supports) = evaluateClassifier(clf, X, y) 
print(result[:5])
print(clf.feature_importances_)