In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,roc_auc_score,average_precision_score,confusion_matrix,cohen_kappa_score,classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib 

%matplotlib inline

# Widen pandas' display limits so frames with many one-hot label columns can be
# inspected. The former 'display.height' setting is omitted: it is a
# deprecated/removed pandas option and raises OptionError on current versions.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
# Read the raw sheet: one keyword per row, its tags spread over several columns.
raw_keywords = pd.read_csv("../data/20180827_Keyword_Classification.csv", encoding="utf-8")

# Every column except the keyword text and the two bookkeeping columns holds tags.
df_labels = raw_keywords.drop(columns=['Keyword', 'Unnamed: 1', 'Total tags'])

# Collapse the tag columns of each row into a single list, skipping empty cells.
df = pd.DataFrame({
    'keyword': raw_keywords['Keyword'],
    'google_class': df_labels.apply(lambda row: row.dropna().tolist(), axis=1),
})

In [None]:
df.head()

In [None]:
def save_plot(ax, plotname):
    """Save the figure that owns ``ax`` under ``../images/``.

    Parameters
    ----------
    ax : object exposing ``get_figure()`` (e.g. a matplotlib Axes)
    plotname : file name, including extension, to write inside ``../images/``
    """
    target = '../images/{0}'.format(plotname)
    ax.get_figure().savefig(target)

### Replace " / " and spaces in labels with "_" (e.g. "dimension / weight" becomes "dimension_weight")

In [None]:
# google_class holds a *list* of labels per row. Normalise each label on its own
# (e.g. "dimension / weight" -> "dimension_weight") and keep the list structure.
# Casting the whole column to str would turn every list into one string, which
# MultiLabelBinarizer would then treat as a sequence of single characters.
df.google_class = df.google_class.apply(
    lambda labels: [str(label).replace(" / ", "_").replace(" ", "_") for label in labels]
)

### Convert the google_class column to list of labels

In [None]:
df.head()

### Now convert the labels to one hot

In [None]:
# One 0/1 column per distinct label; pop() removes google_class from df in place.
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df.pop('google_class'))
label_frame = pd.DataFrame(label_matrix, index=df.index, columns=mlb.classes_)
df2 = df.join(label_frame)

In [None]:
df2.head()
df2.columns

### count the number of queries by label

In [None]:
# Sum each one-hot column to get the number of queries carrying that label.
df_count = df2.drop(columns=['keyword'])
categories = df_count.columns.tolist()
counts = [(name, df_count[name].sum()) for name in categories]
df_stats = pd.DataFrame(counts, columns=['category', 'number_of_queries'])
df_stats.sort_values(by='number_of_queries', ascending=False)

In [None]:
df_stats.head()

In [None]:
# The 30 labels with the most queries, most frequent first.
ranked_stats = df_stats.sort_values('number_of_queries', ascending=False)
top_intents = ranked_stats['category'].head(30).tolist()
top_intents[:10]

### How many queries have multi-labels

In [None]:
# Number of labels attached to each query. Only the numeric one-hot columns are
# summed; the former iloc[:, 2:] slice skipped the first label column, because
# df2 has a single leading text column ('keyword') at this point.
rowsums = df2.select_dtypes(include='number').sum(axis=1)
x = rowsums.value_counts()

# Explicit axes and keyword arguments: recent seaborn releases no longer accept
# x/y passed positionally to barplot.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=x.index, y=x.values, ax=ax)
ax.set_title("Multiple Categories Per Query")
ax.set_ylabel('# of Occurrences', fontsize=12)
ax.set_xlabel('# of Categories', fontsize=12)
save_plot(ax, 'category_count.png')

### Distribution of query length (number of characters per query)

In [None]:
# Series.str.len() counts characters, not words, so the plot is labelled
# accordingly. The output file name is kept unchanged.
lens = df2.keyword.str.len()
# Draw the histogram once; it used to be drawn twice onto the same axes.
fig = lens.hist(bins=np.arange(0, 89, 1))  # hist() returns the Axes it drew on
plt.title('Character Count Distribution')
plt.ylabel('# of Queries', fontsize=12)
plt.xlabel('# of Characters', fontsize=12)
save_plot(fig, 'word_count_dist.png')

Most of the queries are shorter than 50 characters.

### Now clean up the queries

In [None]:
def clean_text(text):
    """Lower-case a query, expand common English contractions and strip punctuation.

    Every character that is not a word character, ``$`` or ``#`` is replaced by
    a space; runs of whitespace are then collapsed and leading/trailing spaces
    removed.

    Parameters
    ----------
    text : str
        Raw query text.

    Returns
    -------
    str
        The cleaned query.
    """
    removelist = '$#'  # punctuation that must survive the clean-up
    text = text.lower()

    # Order matters: specific rules must run before the generic suffix rules
    # that would otherwise consume part of their match.
    contractions = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),  # before the generic 's rule, which would leave "cuse"
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),    # before the generic n't rule
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    text = re.sub(r"[^\w" + removelist + "]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(' ')

In [None]:
# Cleaned copy of every query, stored next to the one-hot label columns.
df2['keyword2'] = df2['keyword'].apply(clean_text)
df2.head()

### Split the training data into train and test set

In [None]:
# Keep only the cleaned text; the raw keyword column is no longer needed.
df2 = df2.drop(columns=['keyword'])

# keyword2 is now the right-most column
df2.head()

# Export final DF

In [None]:
df2.to_csv('label_encoded_data.csv',index=False)

In [None]:
categories = df_stats.category.unique()

# 67/33 shuffled split with a fixed seed.
train, test = train_test_split(df2, test_size=0.33, random_state=42, shuffle=True)

# Features are the cleaned query text; targets are all remaining columns.
X_train, X_test = train.keyword2, test.keyword2
y_train = train.drop(columns=['keyword2'])
y_test = test.drop(columns=['keyword2'])

for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
df_stats.category.unique()

In [None]:
len(categories)

In [None]:
X_train.head()
X_test.head()
y_train.head()
y_test.head()

# Technique 1: OneVsRest multi-label strategy

The Multi-label algorithm accepts a binary mask over multiple labels. The result for each prediction will be an array of 0s and 1s marking which class labels apply to each row input sample.

Naive Bayes
The OneVsRest strategy can be used for multi-label learning: one binary classifier is trained per label, so several labels can be predicted for a single instance. Naive Bayes supports multi-class classification, but this is a multi-label scenario, so we wrap Naive Bayes in a OneVsRestClassifier.

### Naive Bayes

In [None]:
# Define a pipeline combining a text feature extractor with a multi-label classifier.
NB_pipeline = Pipeline([
    # stop_words is passed as a list: recent scikit-learn releases reject a set.
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])

accuracy_result = []
ap_result = []
category_list = []
# Predictions are collected per category and turned into a DataFrame once at
# the end, instead of growing a DataFrame one column at a time.
predicted_by_category = {}

for category in categories:
    print('... Processing {}'.format(category))
    # Train a binary model for this label, then score it on the held-out queries.
    NB_pipeline.fit(X_train, y_train[category])
    predicted = NB_pipeline.predict(X_test)
    predicted_by_category[category] = predicted

    # Each metric is computed once and reused for printing and for the summary.
    accuracy = accuracy_score(y_test[category], predicted)
    avg_precision = average_precision_score(y_test[category], predicted)
    kappa = cohen_kappa_score(y_test[category], predicted)
    print('Test accuracy is {}'.format(accuracy))
    print('average_precision_score is {}'.format(avg_precision))
    print('cohen_kappa_score is {}'.format(kappa))

    accuracy_result.append(accuracy)
    ap_result.append(avg_precision)
    category_list.append(category)

prediction = pd.DataFrame(predicted_by_category, columns=list(categories))

df_result = pd.DataFrame(
    {'category': category_list,
     'accuracy': accuracy_result,
     'avg_precision_recall': ap_result
    })
print(classification_report(y_test, prediction))

# No figure is drawn in this cell, so nothing is saved here; a savefig() call
# at this point would only write a blank image.
df_result.head(10)

In [None]:
# y_test2 is not referenced here: it does not exist yet at this point of a
# top-to-bottom run, so the cell would fail with NameError on a fresh kernel.
top_intents

In [None]:
y_test[top_intents].values
y_test.shape
prediction[top_intents].values

In [None]:
# Collapse the multi-label indicator rows (restricted to the top intents) into a
# single class index per query so that a square confusion matrix can be built.
# NOTE(review): argmax returns the position of the first maximum, so a query with
# several top-intent labels is reduced to the first one, and a query with none of
# them is reported as index 0 — confirm this approximation is acceptable.
y_test2 = y_test[top_intents].values.argmax(axis=1)
prediction2 = prediction[top_intents].values.argmax(axis=1)

conf_mat = confusion_matrix(y_test2, prediction2)
conf_mat
fig, ax = plt.subplots(figsize=(15,10))

# Integer-annotated heatmap; axis tick labels are class indices, not intent names.
sns.heatmap(conf_mat, annot=True, fmt='d')
#             xticklabels = top_intents, yticklabels=top_intents)

plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Model Confusion Matrix')
# plt.savefig("images/conf_matrix_linearsvc.png")
plt.show()

In [None]:
conf_mat.shape

In [None]:
print(classification_report(y_test, prediction,target_names=prediction.columns))

In [None]:
# Preview the per-category Naive Bayes results. df_result2 is not used here: it
# does not exist yet at this point of a top-to-bottom run.
df_result.head()

In [None]:
pr_columns = ['avg_precision_recall', 'category']
df_result2_full = df_result[pr_columns]
# Restrict the chart to the most frequent intents.
is_top_intent = df_result['category'].isin(top_intents)
df_result2 = df_result.loc[is_top_intent, pr_columns]

ax = df_result2.plot.barh(x='category', y='avg_precision_recall',
                          figsize=(20, 10), fontsize=14, legend=False)
fig = ax.get_figure()
fig.savefig('../images/avg_pr_naive_bayes.png')

### Linear SVC

### with Gridsearch

In [None]:
# Baseline pipeline: TF-IDF features feeding one LinearSVC per label.
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

# Grid search over the TF-IDF settings only (the classifier keeps its defaults).
# The grid below has 3 * 2 * 4 * 7 * 2 * 2 * 3 = 2016 combinations; with cv=2
# that is 4032 pipeline fits, so this cell is expensive to run.
parameters = {'tfidf__ngram_range': [(1, 1), (1, 2),(1,3)],
              'tfidf__use_idf': (True, False),
              'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
              'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
              'tfidf__stop_words': ('english', None),
              'tfidf__smooth_idf': (True, False),
              'tfidf__norm': ('l1', 'l2', None),
              }

# Fitted against the full multi-label target (one 0/1 column per label).
grid = GridSearchCV(SVC_pipeline, parameters, cv=2, verbose=1)
grid.fit(X_train, y_train)

# SVC_pipeline.fit(X_train, y_train)
# prediction_test = SVC_pipeline_test.predict(X_test)


In [None]:
svc_pipeline = joblib.load('svc_pipeline.pickle')
svc_pipeline.steps

In [None]:
joblib.dump(grid.best_estimator_,'svc_pipeline.pickle',compress=1)

In [None]:
svc_pipeline = joblib.load('svc_pipeline.pickle')

# The query is defined in this cell so that it runs on a fresh kernel; it was
# otherwise only assigned further down the notebook.
test_query = 'itunes reset password'

p = svc_pipeline.predict([test_query])
print(p)
# p[0] is the indicator row for the single query: print every label flagged 1.
for idx, flag in enumerate(p[0]):
    if flag == 1:
        print(categories[idx])
# prediction_test

In [None]:
test_query = 'itunes reset password'
svc_pipeline = joblib.load('svc_pipeline2.pickle')
p =svc_pipeline.predict([test_query])
print(p)
for idx,i in enumerate(p[0]):
    if i==1:
        print (categories[idx])
# prediction_test
    

In [None]:
# prediction_test is never assigned by any active code in the notebook, so it is
# computed here from the loaded pipeline.
# NOTE(review): assumes svc_pipeline is the multi-label model, so each row is a
# 0/1 vector with one entry per label — confirm.
prediction_test = svc_pipeline.predict(X_test)
len(prediction_test[0])

for i in prediction_test[:10]:
    print(i)

### Don't run the cell below. It has a grid search.

In [None]:

# SVC pipeline with every hyper-parameter spelled out: TF-IDF over 1- and 2-grams
# (max_df=0.25, no IDF weighting, l2 norm) feeding a one-vs-rest LinearSVC.
# NOTE(review): presumably these are the best parameters found by the grid
# search — confirm.
SVC_pipeline = Pipeline([
 ('tfidf',
 (TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           encoding='utf-8',
          lowercase=True, max_df=0.25, max_features=None, min_df=1,
          ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=False,
          vocabulary=None))),
 ('clf',
  OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, loss='squared_hinge', max_iter=1000,
       multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
       verbose=0), n_jobs=1))])

# Fit on the full multi-label target (all label columns at once).
SVC_pipeline.fit(X_train, y_train)

# Persist the fitted multi-label pipeline (lightly compressed).
joblib.dump(SVC_pipeline,'svc_pipeline2.pickle',compress=1)

In [None]:
def make_svc_pipeline():
    """Return a fresh, unfitted TF-IDF + one-vs-rest LinearSVC pipeline."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            analyzer='word', binary=False, decode_error='strict',
            encoding='utf-8', lowercase=True, max_df=0.25, max_features=None,
            min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
            smooth_idf=True, stop_words=None, strip_accents=None,
            sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
            tokenizer=None, use_idf=False, vocabulary=None)),
        ('clf', OneVsRestClassifier(
            estimator=LinearSVC(
                C=1.0, class_weight=None, dual=True, fit_intercept=True,
                intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                multi_class='ovr', penalty='l2', random_state=None,
                tol=0.0001, verbose=0),
            n_jobs=1)),
    ])


# Multi-label model trained on all label columns at once. It is deliberately
# NOT refitted inside the loop below: refitting it per category would leave
# SVC_pipeline as a single-label model for the last category, and that is what
# a subsequent joblib.dump(SVC_pipeline, ...) would persist.
SVC_pipeline = make_svc_pipeline()
SVC_pipeline.fit(X_train, y_train)

accuracy_result = []
ap_result = []
category_list = []
# Collected per category and converted to a DataFrame once at the end.
predicted_by_category = {}

for category in categories:
    print('... Processing {}'.format(category))
    # A separate pipeline per category keeps the multi-label model intact.
    category_pipeline = make_svc_pipeline()
    category_pipeline.fit(X_train, y_train[category])
    predicted = category_pipeline.predict(X_test)
    predicted_by_category[category] = predicted

    accuracy = accuracy_score(y_test[category], predicted)
    print('Test accuracy is {}'.format(accuracy))
    accuracy_result.append(accuracy)
    ap_result.append(average_precision_score(y_test[category], predicted))
    category_list.append(category)

prediction = pd.DataFrame(predicted_by_category, columns=list(categories))

df_result_svc = pd.DataFrame(
    {'category': category_list,
     'accuracy_svc': accuracy_result,
     'avg_precision_recall_svc': ap_result
    })
df_result_svc.head(10)

In [None]:
joblib.dump(SVC_pipeline,'svc_pipeline2.pickle',compress=1)

In [None]:
df_result_svc[['category','avg_precision_recall_svc']].dropna().to_csv('../data/avg_precision_recall_stats.csv',index=False)

In [None]:
df_result_svc[['category','avg_precision_recall_svc']].dropna()

In [None]:
df_result_svc2 = df_result_svc[df_result_svc['category'].isin(top_intents)][['avg_precision_recall_svc','category']]
df_result_svc2.plot(kind = 'barh',x = 'category',y='avg_precision_recall_svc',figsize=(20,10),fontsize=14)


### Testing SVC with real data:

In [None]:
svc_pipeline = joblib.load('svc_pipeline2.pickle')

In [None]:
# svc_pipeline
def return_class(test_query):
    """Return the categories predicted for a single query string.

    For every entry of the global ``categories`` the global ``SVC_pipeline`` is
    refitted on ``X_train`` against that category's column of ``train`` and then
    asked to classify ``test_query``; categories predicted as 1 are collected.

    NOTE(review): each call refits the pipeline once per category, so it is
    slow, and it leaves ``SVC_pipeline`` fitted on the last category only.
    Predicting once with an already fitted multi-label model would avoid both —
    confirm which persisted model should be used before changing this.
    """
    list_of_predicted_intents = []
    for category in categories:
        # Refit for this single category (overwrites any previous fit).
        SVC_pipeline.fit(X_train, train[category])
        predicted = SVC_pipeline.predict([test_query])
    #     print (predicted)
        # predict() received one query, so predicted[0] is its 0/1 decision.
        if predicted[0] ==1:
            list_of_predicted_intents.append(category)
    return list_of_predicted_intents
        
            

In [None]:
test_query = 'itunes password reset'

# `l` is built here so the cell runs on a fresh kernel; it was otherwise only
# defined further down the notebook.
l = list(X_test)

# return_class refits one model per category on every call, so only a small
# sample of the held-out queries is classified, and the result is printed
# rather than discarded.
N_SAMPLE_QUERIES = 5
for i in l[:N_SAMPLE_QUERIES]:
    print(i)
    print(return_class(i))

### Logistic regression

In [None]:
l = list(X_test)
test_query = 'itunes password reset'
LogReg_pipeline = Pipeline([
    # stop_words is passed as a list: recent scikit-learn releases reject a set.
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])

# Spot-check a few categories against a single query.
for category in ['nan', 'iTunes', 'iPod']:
    if category not in y_train.columns:
        # Hard-coded names may not exist as label columns in this dataset.
        print('{} is not a label column; skipping'.format(category))
        continue

    # Train a binary model for this label and classify the one test query.
    LogReg_pipeline.fit(X_train, y_train[category])
    query_prediction = LogReg_pipeline.predict([test_query])
    print(category)
    # One 0/1 entry: does the query belong to this category?
    print(query_prediction)

    # The prediction has exactly one entry (for test_query), so it is reported
    # against the query itself; it must not be used as an index into X_test.
    if query_prediction[0] == 1:
        print('"{}" is predicted to belong to {}'.format(test_query, category))

In [None]:
LogReg_pipeline = Pipeline([
    # stop_words is passed as a list: recent scikit-learn releases reject a set.
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])

accuracy_result = []
ap_result = []
category_list = []
# Collected per category and converted to a DataFrame once at the end.
predicted_by_category = {}

for category in categories:
    print('... Processing {}'.format(category))
    # Train a binary model for this label, then score it on the held-out queries.
    LogReg_pipeline.fit(X_train, y_train[category])
    predicted = LogReg_pipeline.predict(X_test)
    predicted_by_category[category] = predicted

    accuracy_result.append(accuracy_score(y_test[category], predicted))
    # Score against this category's predictions only; passing the whole
    # prediction table here does not match the 1-D target.
    ap_result.append(average_precision_score(y_test[category], predicted))
    category_list.append(category)

prediction = pd.DataFrame(predicted_by_category, columns=list(categories))

df_result_lg = pd.DataFrame(
    {'category': category_list,
     'accuracy_lg': accuracy_result,
     'avg_precision_recall_lg': ap_result
    })
df_result_lg.head(10)


In [None]:
df_result_lg2 = df_result_lg[df_result_lg['category'].isin(top_intents)][['avg_precision_recall_lg','category']]
df_result_lg2.plot(kind = 'barh',x = 'category',y='avg_precision_recall_lg',figsize=(20,10),fontsize=14)

### Merge 3 charts to visualize side by side

In [None]:
df_chart = pd.merge(df_result2,df_result_svc2, on='category')
df_chart2 = pd.merge(df_chart,df_result_lg2,on='category')
df_chart2.head()

In [None]:
df_chart2.to_csv("../data/avg_precision_recall_stats.csv",index=False)

In [None]:
# Horizontal bars: one group per category, one bar per model.
ax = df_chart2.plot(kind='barh', x='category', figsize=(50, 50), fontsize=40)
fig = ax.get_figure()
# Title and legend are set on the axes themselves rather than through pyplot's
# implicit "current figure" state.
ax.set_title("Comparison of Avg PR score for each model", fontsize=50)
ax.legend(fontsize=40)  # size in points
fig.savefig('../images/comparison_pr_result.png')


In [None]:
df_chart2.category.unique()
df_chart2.head()

### Using Seaborn to plot comparison PR chart

In [None]:
df_melt = pd.melt(df_chart2, id_vars="category", var_name="model", value_name="avg_pr_score")

df_melt.head()

In [None]:
# Theme first, so that it applies to the axes created below. Palette and font
# scale are set in one call: a separate sns.set(font_scale=1) afterwards would
# reset the palette to seaborn's default.
sns.set(font_scale=1, palette="bright")

fig, ax = plt.subplots(figsize=(30, 20))

# barplot is the axes-level function and draws onto `ax`. catplot is
# figure-level: it builds its own figure and does not draw on a supplied axes,
# which would leave `ax` empty.
sns.barplot(y='category', x='avg_pr_score', hue='model', data=df_melt, ax=ax)
ax.legend(loc='lower right')


In [None]:
len(categories)
len(prediction)

In [None]:
df_raw = pd.read_csv("../data/20180827_Keyword_Classification.csv")
df_raw.head()
# df_raw.google_class = df_raw.google_class.astype(str)

# reset_test = test.reset_index()

In [None]:
# Reload the raw sheet and keep every original column, adding the per-row list
# of tags and a positional 'index' column.
df_test = pd.read_csv("../data/20180827_Keyword_Classification.csv", encoding="utf-8")
df_labels = df_test.drop(columns=['Keyword', 'Unnamed: 1', 'Total tags'])

df_test = (
    df_test
    .assign(google_classes=df_labels.apply(lambda row: row.dropna().tolist(), axis=1))
    .rename(columns={'Unnamed: 1': 'google_class', 'Keyword': 'keyword'})
    .reset_index()
)

In [None]:
df_test = df_test.rename(columns={'index': 'idx'})

# First ten queries paired with their row positions.
test_queries = df_test['keyword'].head(10).tolist()
test_indexes = df_test['idx'].head(10).tolist()
t = list(zip(test_queries, test_indexes))

In [None]:
df_test.head()

In [None]:
y_test_columns = pd.DataFrame(y_test.columns.tolist(),columns=['tag_name'])
y_test_columns

In [None]:
df_test.keyword.sample(n=100).to_csv("../data/test_input_file.csv",index=False)
y_test_columns.to_csv('../data/tag_list.csv',index=False)

In [None]:
svc_pipeline = joblib.load('svc_pipeline2.pickle')

# Label names in the column order the model was trained on.
selected_categories = y_test.columns

for query, row_idx in t:
    print("query : {0}".format(query))
    # predict() returns hard 0/1 decisions (one flag per label), not
    # probabilities, so no threshold is applied here.
    predicted = svc_pipeline.predict([query])
    print(predicted)

    predicted_list = [selected_categories[i]
                      for i, p in enumerate(predicted[0]) if p == 1]
    # .iloc replaces the removed .ix indexer; row_idx is the row position.
    true_tags = df_test.google_classes.iloc[row_idx]
    print("predicted tags : {0}".format(predicted_list))
    print("true tags : {0}".format(true_tags))

    # The label columns were built from tags with " / " and spaces replaced by
    # "_", so the raw tags are normalised the same way before comparing.
    normalised_true_tags = {str(tag).replace(" / ", "_").replace(" ", "_")
                            for tag in true_tags}
    count = sum(1 for tag in predicted_list if tag in normalised_true_tags)

    # Rounded to a whole percentage without float artefacts (e.g. 56.999...);
    # a query without any true tag reports 0 instead of dividing by zero.
    if len(true_tags) > 0:
        percent = round(100.0 * count / len(true_tags), 0)
    else:
        percent = 0.0

    print("percentage of predicted in true tags: {0} %".format(str(percent)))
    print()
    print("****************************************")


