In [27]:
import pandas as pd 
import numpy as np
import os 
import json
from math import ceil, floor

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.preprocessing import LabelEncoder,scale
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.utils.multiclass import unique_labels

from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 11})

import re
import nltk
from nltk.corpus import stopwords


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input, Dropout, GlobalMaxPooling1D
from keras.metrics import categorical_accuracy
from keras.callbacks import  EarlyStopping


In [28]:
# Pre-compiled patterns and the stop-word set used by clean_text().
# The first pattern is a raw string so that '\[', '\]' and '\|' reach the regex
# engine unchanged without raising invalid-escape-sequence warnings.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;-]')  # punctuation replaced by a space
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')  # everything outside this whitelist is deleted
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    # The NLTK stop-word corpus is not installed in this environment: fetch it once and retry.
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))
Number_RE = re.compile('[*^0-9]')  # matches digits and the literal characters '*' and '^'
Bad_underline = re.compile('[*_*]')  # matches the literal characters '*' and '_'
RemoveTag = re.compile('&lt;|br&gt;|b&gt;|ul&gt;|li&gt;')  # escaped HTML tag fragments to delete

def clean_text(text):
    """
        Normalise a free-text description before it is vectorised.

        text: a string

        return: the lower-cased text after the module-level patterns have been
                applied and English stop words have been dropped
    """
    # (pattern, replacement) pairs are applied in this order; each one sees the
    # output of the previous one.
    substitutions = (
        (RemoveTag, ''),             # delete tag fragments
        (REPLACE_BY_SPACE_RE, ' '),  # punctuation -> space
        (Number_RE, ' '),            # digits, '*' and '^' -> space
        (Bad_underline, ' '),        # '*' and '_' -> space
        (BAD_SYMBOLS_RE, ''),        # delete anything not whitelisted
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    # Splitting on whitespace also collapses the runs of spaces created above.
    kept_words = [token for token in cleaned.split() if token not in STOPWORDS]
    return ' '.join(kept_words)
# Matches one space sitting at the very end of the string.
RemoveLastSpace = re.compile(' $')


def clean_text_category(text):
    """
        Drop a single trailing space from a category label.

        text: a string

        return: the string without its final space; other whitespace is untouched
    """
    return RemoveLastSpace.sub('', text)
    

In [29]:
def classNumberThreshold(arr, threshold=30):
    """
    List the keys whose count does not exceed ``threshold``.

    arr: object exposing ``.items()`` that yields (key, count) pairs,
         e.g. a dict or the Series returned by ``value_counts()``
    threshold: largest count still considered too small (inclusive); defaults to 30

    return: list of keys with count <= threshold, in iteration order
    """
    return [key for key, value in arr.items() if value <= threshold]

In [None]:
# Load the product table from CSV into `df` (path is relative to the notebook's working directory).
df = pd.read_csv('../../example/fliptkart.csv')

In [None]:
# Keep only rows that have a description, a main category and a pid.
# Filtering first means clean_text only runs on rows that survive, and .copy()
# makes the result an independent frame so the column assignment below does not
# write into a slice (which raises pandas' SettingWithCopyWarning).
df = df.dropna(subset=['description', 'category_main', 'pid']).copy()
df['description'] = df['description'].apply(clean_text)



In [None]:
# Categories whose row count is at or below classNumberThreshold's cut-off are removed.
# Both lists are computed on the same snapshot of df, before any row is dropped.
dropCategoryCode = classNumberThreshold(df.category_main.value_counts())
dropSubCategoryCode = classNumberThreshold(df.category_sub1.value_counts())

# One vectorised membership mask per column instead of re-filtering the whole
# frame once per category. Rows whose value is missing are kept: value_counts()
# skips NaN, so NaN never appears in either drop list.
df = df[~df.category_main.isin(dropCategoryCode)]
df = df[~df.category_sub1.isin(dropSubCategoryCode)]


In [None]:
# Print column dtypes and non-null counts of the filtered frame as a sanity check.
df.info()

In [None]:
# Encode category names as integer ids. The same encoder object is fitted twice,
# so `target` / `subtarget` capture the class arrays right after each fit;
# after this cell `le` itself reflects the sub-category fit only.
le = LabelEncoder()

labels = le.fit_transform(df.category_main)
target = le.classes_

sublabels = le.fit_transform(df.category_sub1)
subtarget = le.classes_

In [None]:
# TF-IDF features from the cleaned descriptions. Terms found in fewer than
# 5 documents, or in more than 70% of them, are left out of the vocabulary.
tfidfconverter = TfidfVectorizer(max_df=0.7, min_df=5)
X = tfidfconverter.fit_transform(df['description'])
# Vocabulary terms, positionally aligned with the columns of X.
featureNames = tfidfconverter.get_feature_names()

In [None]:
# Hold out 10% of the rows for testing. The target travels as a small frame that
# pairs each encoded label with the row's index in df, so test rows can be
# looked up in df again later.
X_train, X_test, y_train_info, y_test_info = train_test_split(
    X,
    pd.DataFrame({'index':df.index, 'label':labels}),
    random_state = 27,
    test_size=0.1,
)

In [None]:
# Pull the encoded labels out of the info frames; these are what the models fit and score on.
y_train = y_train_info['label']
y_test = y_test_info['label']


In [None]:
# Linear SVM trained on every TF-IDF feature, with class weights balanced by frequency.
model = LinearSVC(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

In [None]:
# Candidate feature-set sizes: 100, 200, ... (strictly below the vocabulary size).
BestSize = list(range(100, len(featureNames), 100))
if not BestSize:
    # Without this check the failure would surface later as an opaque argmax error.
    raise ValueError("Need more than 100 TF-IDF features to search over feature-set sizes; got "
                     + str(len(featureNames)))
Models = []
Scores = []
SelectModels = []  # never filled in this cell; kept so the name stays defined
for ele in BestSize:
    print("--- Best "+ str(ele) + " features \n")
    # chi2 keeps the `ele` highest-scoring features, then a fresh LinearSVC is trained on them.
    selectBest = SelectKBest(chi2, k=ele)
    model = LinearSVC(random_state=42, class_weight="balanced")
    K_best_linearsvc = Pipeline([('SelectBest', selectBest), ('linearSVC', model)])
    K_best_linearsvc.fit(X_train, y_train)
    # NOTE(review): candidates are ranked on the held-out test split, so the
    # winning score is an optimistic estimate — consider a validation split.
    score = K_best_linearsvc.score(X_test, y_test)
    Models.append(K_best_linearsvc)
    Scores.append(score)

# Locate the winner once and reuse the position for both lookups.
bestIndex = int(np.argmax(Scores))
bestModel = Models[bestIndex]
bestSize = BestSize[bestIndex]

# save the model to disk (create the output folder first so the dump cannot
# fail on a missing directory)
os.makedirs('Flipkart', exist_ok=True)
filename = 'Flipkart/'+str(bestSize)+'BestLinearSVC.sav'
joblib.dump(bestModel, filename)

# Per-class decision scores and hard predictions of the winning pipeline on the test split.
yConfidence = bestModel.decision_function(X_test)
yPred = bestModel.predict(X_test)


In [None]:
# Feature-selection step of the winning pipeline.
selectBest = bestModel.get_params()['SelectBest']

originScore = selectBest.scores_        # one chi2 score per TF-IDF feature
supportsList = selectBest.get_support()  # True where the feature was kept

# Min-max normalise against integer bounds. The nan-aware reductions keep an
# undefined score from poisoning the bounds, and the `or 1` avoids a division
# by zero when every score equals the same integer.
Max = ceil(np.nanmax(originScore))
Min = floor(np.nanmin(originScore))
scoreRange = (Max - Min) or 1
Normscores = [round((ele-Min)/scoreRange, 2) for ele in originScore]

# feature name -> (kept?, normalised score)
scoreDict = dict(zip(featureNames, zip(supportsList, Normscores)))
# Only the features the selector kept make it into the ranking.
newScoreDict = {key: item[1] for key, item in scoreDict.items() if item[0]}

# No dtype is forced here: "Feature" holds strings and "Score" holds fractions,
# so coercing the frame to int64 is either ignored or rejected by pandas.
ranksfeaturesDict = pd.DataFrame({"Feature":list(newScoreDict.keys()),
                                  "Score":list(newScoreDict.values())})
ranksfeaturesDict = ranksfeaturesDict.sort_values(by=['Score'],ascending=False)
ranksfeaturesDict.to_csv("Flipkart/RankByTotalFeature.csv",index=False,compression=None)

In [None]:
# Boolean mask over the TF-IDF vocabulary: True where the winning selector kept the feature.
BestModelsupportsList = bestModel.get_params()['SelectBest'].get_support()

# Names of the kept features, in vocabulary order.
NewFeatureName = [
    featureName
    for featureName, isKept in zip(featureNames, BestModelsupportsList)
    if isKept
]

In [None]:
classWithInfluence = {}

coef = bestModel.get_params()['linearSVC'].coef_
coef = scale(coef)  # sklearn.preprocessing.scale with its default settings

coefTotalSize = coef.size
coef1DArr = coef.reshape(coefTotalSize, 1)
# The bounds span every coefficient, so the normalised values below stay inside
# [0, 1]; `or 1` avoids a division by zero when all values share one integer.
Max = ceil(coef.max())
Min = floor(coef.min())
coefRange = (Max - Min) or 1

# One row of coef per class: pair every selected feature with its normalised weight.
for index in range(0, len(coef)):
    normalisedCoef = [round((ele-Min)/coefRange, 2) for ele in coef[index]]
    classWithInfluence[target[index]] = list(zip(NewFeatureName, normalisedCoef))

# Keep the ten highest-weighted features of each class.
Top10InflunceFeature = {
    key: sorted(value, key=lambda x: x[1], reverse=True)[:10]
    for key, value in classWithInfluence.items()
}

with open('Flipkart/CompanyTopFeatureByClass.json', 'w') as json_file:
    json.dump(Top10InflunceFeature, json_file, indent=2)

In [None]:
totalConfidenceSize = yConfidence.size
Confidence1DArr = yConfidence.reshape(totalConfidenceSize, 1)
# Integer bounds over every decision score, taken with array reductions rather
# than Python's max/min over rows; `or 1` avoids a division by zero.
Max = ceil(yConfidence.max())
Min = floor(yConfidence.min())
confidenceRange = (Max - Min) or 1
# A row's confidence is its largest decision score, min-max normalised.
NormalisedConfid = [round((rowMax-Min)/confidenceRange, 2) for rowMax in yConfidence.max(axis=1)]

# Fetch pid and description for all test rows with one label-based lookup
# (in split order) instead of scanning df once per test row.
testRows = df.loc[y_test_info['index'].values, ['pid', 'description']]
productID = testRows['pid'].tolist()
ProductDescription = testRows['description'].tolist()

# 'trueClassNaem' is the column header written to the CSV; its spelling is left
# unchanged so that existing readers of the file keep working.
LinearSVCResult = pd.DataFrame({'id':productID,'description':ProductDescription,
                         'trueClass': y_test,'trueClassNaem':target[y_test] ,
                         'predictClass':yPred,'predictClassName':target[yPred],
                         'Confidence':NormalisedConfid})
LinearSVCResult.to_csv("Flipkart/SVCResults.csv",index=False,compression=None)

In [None]:
def draw_confidenceReport(data, output_path="Flipkart/ConfidenceScoreReportSVC.png"):
    """
    Plot coverage and accuracy against a confidence cut-off and save the figure.

    For every cut-off from 0.20 to 0.80 in steps of 0.02, only rows whose
    ``Confidence`` is strictly greater than the cut-off are considered. Two
    series are drawn on twin y-axes: the share of all rows that remain, and the
    share of the remaining rows whose ``trueClass`` equals ``predictClass``.

    data: DataFrame with ``Confidence``, ``trueClass`` and ``predictClass`` columns
    output_path: file the figure is saved to; defaults to the path this notebook uses

    return: None
    """
    num = [ele*0.01 for ele in range(20, 81, 2)]
    totalRows = len(data.index)
    # Row-wise correctness is independent of the cut-off, so it is computed once.
    isCorrect = data['trueClass'] == data['predictClass']

    ClassfiedProportion = []
    Acc = []
    for cutoff in num:
        aboveCutoff = data['Confidence'] > cutoff
        kept = int(aboveCutoff.sum())
        hits = int((aboveCutoff & isCorrect).sum())
        # Both ratios fall back to 0 when their denominator is empty, so an
        # empty frame or an over-strict cut-off cannot divide by zero.
        ClassfiedProportion.append(kept/totalRows if totalRows else 0)
        Acc.append(hits/kept if kept else 0)

    fig, ax = plt.subplots()

    color = 'tab:blue'
    ax.scatter(num, ClassfiedProportion, label="Proportion classified", color=color)
    ax.set_xlabel("Confidence Score")
    ax.set_ylabel("Proportion classified", color=color)
    ax.xaxis.set_ticks(np.arange(0, 1.1, 0.1))
    ax.yaxis.set_ticks(np.arange(0, 1.1, 0.1))
    ax.tick_params(axis='y', labelcolor=color)

    color = 'tab:red'

    ax2 = ax.twinx()  # instantiate a second axes that shares the same x-axis

    ax2.scatter(num, Acc, label="Accuracy", color=color)
    ax2.set_ylabel("Accuracy", color=color)
    ax2.yaxis.set_ticks(np.arange(0, 1.1, 0.1))
    ax2.tick_params(axis='y', labelcolor=color)

    ax.grid()
    fig.savefig(output_path)

In [None]:
# Restrict the report to the classes that actually occur in y_test or yPred and
# pick their names by encoded id, so the names stay aligned even when a class is
# missing from the test split.
presentLabels = unique_labels(y_test, yPred)
report = classification_report(y_test, yPred, labels=presentLabels,
                               target_names=target[presentLabels])
# The report is built once and reused for both the cell output and the file.
print(report)
with open('Flipkart/SVCReport.txt', 'w') as file:
    file.write(report)

In [None]:
# Confusion matrix of the test split: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, yPred)

In [None]:
# Draw the confusion matrix as an annotated heat map and save it as a PNG.
fig, ax = plt.subplots(figsize=(8,8))
im = ax.imshow(cm, cmap=plt.cm.Blues, interpolation='nearest')
ax.figure.colorbar(im, ax=ax)

# Names of the classes that occur in y_test or yPred, in encoded-id order.
classes = target[unique_labels(y_test, yPred)]

# One tick per matrix column / row, labelled with the class names.
ax.set_xticks(np.arange(cm.shape[1]))
ax.set_yticks(np.arange(cm.shape[0]))
ax.set_xticklabels(classes)
ax.set_yticklabels(classes)
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')

# Slant the x labels so long class names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

fmt = 'd'
thresh = cm.max() / 2.
# Write each count into its cell; cells darker than the midpoint get white text.
for i, j in np.ndindex(*cm.shape):
    textColor = "white" if cm[i, j] > thresh else "black"
    ax.text(j, i, format(cm[i, j], fmt), ha="center", va="center", color=textColor)

fig.tight_layout()
fig.savefig("Flipkart/LinearSVCConfusionMatrix.png")

In [None]:
# Plot and save the confidence report for the LinearSVC results table.
draw_confidenceReport(LinearSVCResult)

In [None]:
# Test accuracy of each candidate pipeline against its number of selected features.
fig, ax = plt.subplots()
ax.plot(BestSize, Scores, label="Linear SVC")
ax.set(xlabel="Number of best features", ylabel="Accuracy")
ax.grid()
ax.legend()
fig.savefig("Flipkart/linearsvcbestFeatureSize.png")