In [1]:
import pandas as pd 
import numpy as np
import sys
import sklearn
import nltk
import matplotlib.pyplot as plt
from sklearn import model_selection
from nltk.classify.scikitlearn import SklearnClassifier

In [2]:
#Importing Excel File as a DataFrame
file = 'data.xlsx'
data = pd.ExcelFile(file)
print(data.sheet_names)
df = data.parse('Combined File - 01.08.2019')
df.info()

['Combined File - 01.08.2019']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134756 entries, 0 to 134755
Data columns (total 7 columns):
Case Owner             134756 non-null object
Subject                134750 non-null object
Description            134720 non-null object
Date/Time Opened       134756 non-null datetime64[ns]
Closed                 134756 non-null int64
Queue in Salesforce    134756 non-null object
Unnamed: 6             206 non-null object
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 7.2+ MB


In [4]:
# Check all columns of DataFrame, drop unnecessary ones 
print(df.columns.values.tolist())
df.drop(['Date/Time Opened'], axis = 1, inplace = True )

['Case Owner', 'Subject', 'Description', 'Date/Time Opened', 'Closed', 'Queue in Salesforce', 'Unnamed: 6']


In [3]:
#checking for null values 
df.isnull().sum()
df["Subject"].fillna("No Subject", inplace = True)
df["Description"].fillna("No Description", inplace = True)
print(df.isnull().sum())


Case Owner                  0
Subject                     0
Description                 0
Date/Time Opened            0
Closed                      0
Queue in Salesforce         0
Unnamed: 6             134550
dtype: int64


In [4]:
print(df['Closed'].value_counts(dropna = False))
print(df['Case Owner'].value_counts(dropna = False))
df.describe()


1    132334
0      2422
Name: Closed, dtype: int64
Manoj Purohit                  18790
Himanshu Rajput                 5806
Sunny Thakkar                   5684
Priyanka Maharaj                5674
Mayur Saraswat                  5549
                               ...  
Nirav Solanki                      1
Venkata Mahendra Prattipati        1
Arun Sudershan                     1
Regan Curry                        1
Santhosh Babu Eranti               1
Name: Case Owner, Length: 163, dtype: int64


Unnamed: 0,Closed
count,134756.0
mean,0.982027
std,0.132855
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [5]:
from sklearn.preprocessing import LabelEncoder

# convert class labels to binary values

encoder = LabelEncoder()
y = encoder.fit_transform(df['Queue in Salesforce'])


In [6]:
# Replacing Email Adresses in all columns with a simple string
df['Subject'] = df['Subject'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailadd')
df['Description'] = df['Description'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailadd')



In [7]:
#Replacing URLs with a simple string 
df['Subject'] = df['Subject'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','url')
df['Description'] = df['Description'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','url')



In [8]:
#Replacing Numbers with a simple string numbr
df['Subject'] = df['Subject'].str.replace(r'\d+(\.\d+)?','numbr')
df['Description'] = df['Description'].str.replace(r'\d+(\.\d+)?','numbr')


In [9]:
#Removing Punctuation
df['Subject'] = df['Subject'].str.replace(r'[^\w\d\s]',' ')
df['Description'] = df['Description'].str.replace(r'[^\w\d\s]',' ')


In [10]:
# Removing Whitespace between terms with a single space
df['Subject'] = df['Subject'].str.replace(r'\s+',' ')
df['Description'] = df['Description'].str.replace(r'\s+',' ')

df['Case Owner'] = df['Case Owner'].str.replace(r'\s+',' ')


In [11]:
# Replace 10 Digit Phone Numbers with a single string 
df['Subject'] = df['Subject'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phoneno')
df['Description'] = df['Description'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phoneno')




In [12]:
# Change Words to LowerCase

df['Subject'] = df['Subject'].str.lower()
df['Description'] = df['Description'].str.lower()
df['Case Owner'] = df['Case Owner'].str.lower()


In [13]:
# Import StopWords, apply to each Column in order to filter from the same
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['Subject'] = df['Subject'].apply(lambda x: " ".join(x.lower() for x in str(x).split() \
                                    if x not in stop_words))
df['Description'] = df['Description'].apply(lambda x: " ".join(x.lower() for x in str(x).split() \
                                    if x not in stop_words))
df['Case Owner'] = df['Case Owner'].apply(lambda x: " ".join(x.lower() for x in str(x).split() \
                                    if x not in stop_words))

In [14]:
# Remove Word Stems using a PorterStemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
df['Subject'] = df['Subject'].apply(lambda x: ' '.join(
    ps.stem(term) for term in str(x).split()))
df['Description']= df['Description'].apply(lambda x: ' '.join(
    ps.stem(term) for term in str(x).split()))

In [15]:
df['Queue in Salesforce'] = y
df.drop(['Unnamed: 6'], axis = 1, inplace = True)

In [16]:
df.drop(['Closed'],1,inplace = True)
X = pd.DataFrame(df.drop(['Queue in Salesforce'],1))
y = pd.DataFrame(y)


Feature Engineering


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create feature vectors
vectorizer1 = TfidfVectorizer(min_df = 5,
                             sublinear_tf = True,
                             use_idf = True,
                             max_features = 5000)
vectorizer2 = TfidfVectorizer(min_df = 5,
                              sublinear_tf = True,
                              use_idf = True,
                              max_features = 1000)
feature_vector1 = vectorizer2.fit_transform(df['Subject'])
feature_vector2 = vectorizer1.fit_transform(df['Description'])
#feature_vector3 = vectorizer.fit_transform(df['Case Owner'])
a = feature_vector1.todense()
b = feature_vector2.todense()
#c = feature_vector3.todense()

In [18]:
#from nltk.tokenize import word_tokenize
#subject_words = []
#description_words = []
#for message in df['Subject'] :
    #words = word_tokenize(message)
    #for w in words :
        #subject_words.append(w)
#for message in df['Description'] :
    #words = word_tokenize(message)
    #for w in words :
        #description_words.append(w)
#subject_words = nltk.FreqDist(subject_words)
#description_words = nltk.FreqDist(description_words)


    

In [28]:
#subject_features = encoder.fit_transform(list(subject_words.keys())[:3000])
#description_features = encoder.fit_transform(list(description_words.keys())[:3000])



#def find_features(message) :
    #words = word_tokenize(message)
    #features = {}
    #for word in subject_features :
        #features1[word] = (word in words)
    #for word in description_features :
        #features2[word] = (word in words)
    #return features1,features2



In [18]:
df1 = pd.DataFrame(a)
df2 = pd.DataFrame(b)
#df3 = pd.DataFrame(c)

In [19]:
dfc = pd.concat([df1,df2],axis =1)

In [None]:
#dftrain = pd.concat([dfc,df3],axis = 1)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dfc, y, test_size=0.15, random_state=3)


In [22]:
from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import GridSearchCV
# Create regularization penalty space
#penalty = ['l1', 'l2']

# Create regularization hyperparameter space
#C = np.logspace(0, 4, 10)

# Create hyperparameter options
#hyperparameters = dict(C=C, penalty=penalty)

In [23]:
LR = LogisticRegression(multi_class="multinomial",solver="lbfgs")

#LRcv=GridSearchCV(LR,hyperparameters,cv=5,verbose = 0)
LR.fit(X_train,y_train)
LR.predict(X_test)

  y = column_or_1d(y, warn=True)


array([7, 2, 7, ..., 4, 7, 2])

In [24]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

predict = LR.predict(X_test)
print('Classification Accuracy : ' + str(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test,predict))
print(classification_report(y_test,predict))


Classification Accuracy : 0.8362026318393193
[[   5    0    4    0    0    0    0  118    0]
 [   0  614  419    3   59    0   22   17   43]
 [   1  161 5455   15   41    0  105  388  175]
 [   0    9   60  279  149    0   15   36    4]
 [   0   36   99   19 1415    0    2   16    1]
 [   0    0    4    0    1   12    0    2    1]
 [   0   42  347    7   12    0  216   31   18]
 [   1    2  342   14    8    0    4 8177    8]
 [   0   58  341    1   18    0   13   19  730]]
              precision    recall  f1-score   support

           0       0.71      0.04      0.07       127
           1       0.67      0.52      0.59      1177
           2       0.77      0.86      0.81      6341
           3       0.83      0.51      0.63       552
           4       0.83      0.89      0.86      1588
           5       1.00      0.60      0.75        20
           6       0.57      0.32      0.41       673
           7       0.93      0.96      0.94      8556
           8       0.74      0.62  

In [None]:
3from sklearn.svm import SVC

In [None]:
svc = SVC()
svc.fit(X_train,y_train)
preds = svc.predict(X_test)
print(accuracy_score(y_test, preds))
print(confusion_matrix(y_test,preds))
print(classification_report(y_test,preds))

In [None]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(alpha=0.00001)
classifier.fit(X_train,y_train)
pred = classifier.predict(X_test)
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

dftrain.shape

In [27]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
model = clf.fit(X_train,y_train)
pred = model.predict(X_test)
print('Classification Accuracy' + str(accuracy_score(y_test, pred)))
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

  y = column_or_1d(y, warn=True)


Classification Accuracy0.7770851884832295
[[  15    0   15    1    0    0    0   96    0]
 [   3  571  497    3   19    0   28    7   49]
 [   5  211 5308   49   18    7  202  279  262]
 [   1   16   92  258  134    0   30   15    6]
 [   9  100  246   11 1190    0   19    7    6]
 [   0    0   12    1    0    1    2    2    2]
 [   3   36  327   15    9    1  240   20   22]
 [ 141   17  801   70   11    0   40 7455   21]
 [   1   69  407    1    8    0   12   12  670]]
              precision    recall  f1-score   support

           0       0.08      0.12      0.10       127
           1       0.56      0.49      0.52      1177
           2       0.69      0.84      0.76      6341
           3       0.63      0.47      0.54       552
           4       0.86      0.75      0.80      1588
           5       0.11      0.05      0.07        20
           6       0.42      0.36      0.39       673
           7       0.94      0.87      0.91      8556
           8       0.65      0.57     

### 