In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


In [2]:
dataset=pd.read_csv("Data/TrainingData.csv")
dataset.head()

Unnamed: 0,ModelYear,VerbatimComment,Tone,CustomerComplaint
0,2011,# & PLACE OF AREAS T PLUG IN TO CHARGE THINGS ...,N,Audio Format Compatibility
1,2011,# 12 SUV DEMANDS A MORE POWERFUL ENGINE,N,Acceleration from a Stop
2,2011,# 16 HEATING IT JUST STAYS TOO COLD WHERE IS T...,N,Heating / Cooling Performance
3,2011,"# 17 & # 18 AGILITY & FUN DRIVING PEP TURNING,...",P,General Dynamic Performance
4,2011,# 17 ENGINE/TRANSMISSION MPG / PERFORMANCE SMO...,P,Fuel Economy


In [3]:
commentVector=dataset["VerbatimComment"].values
print(commentVector[:10])

['# & PLACE OF AREAS T PLUG IN TO CHARGE THINGS GPS IPOD ETC.'
 '# 12 SUV DEMANDS A MORE POWERFUL ENGINE'
 '# 16 HEATING IT JUST STAYS TOO COLD WHERE IS THE HEAT.'
 '# 17 & # 18 AGILITY & FUN DRIVING PEP TURNING, SOLID PRODUCING CONFIDENCE'
 '# 17 ENGINE/TRANSMISSION MPG / PERFORMANCE SMOOTHNESS / POWER / MANUAL'
 '# 18 CARGO SPACE' '# 18 STORAGE AND SPACE'
 '# 18CENTER CONSOLE IS AWFUL SMALL DARK TOO FAR IMPRACTABLE'
 '# 19 I NAVIGATION IS NICE' '# 19 NAVIGATION SYSTEM IS AWESOME']


In [76]:
#test with SGD classifier
X=dataset.values[:,1]
y=dataset.values[:,3]
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,stratify=y)
CategoryTextPipe=Pipeline([('vect',CountVectorizer(ngram_range=(1,2),stop_words='english')),
                   ('tfidf',TfidfTransformer()),
                   ('clf',SGDClassifier(loss='hinge', penalty='l2',alpha=1e-6, n_iter=4, random_state=42))])
textPipe = CategoryTextPipe.fit(Xtrain,ytrain)
print(CategoryTextPipe.score(Xtest,ytest))

0.814096499527


In [56]:
#test with Multinomial Naive Bayes classifier
X=dataset.values[:,1]
y=dataset.values[:,2]
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,stratify=y)
SemanticTextPipe=Pipeline([('vect',CountVectorizer(ngram_range=(1,2),stop_words='english')),
                   ('tfidf',TfidfTransformer()),
                   ('clf',MultinomialNB(alpha=2e-4))])
textPipe = SemanticTextPipe.fit(Xtrain,ytrain)
print(SemanticTextPipe.score(Xtest,ytest))

0.851331260981


In [39]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5, 1e-6),}

In [40]:
gs_clf = GridSearchCV(CategoryTextPipe, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X[:400], y[:400])

In [46]:
gs_clf.best_score_
results=pd.DataFrame(gs_clf.cv_results_)
print(results.iloc[17,7])

{'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 1e-06}


In [57]:
rComments=["USB PORT STANDARD ON ALL MODELS ESPECIALLY ONE PLACED IN THE CONSOLE IMPROVING THE RADIO DISPLAY AND PHONE SOUND QUALITY","CONNECTIVITY WITH IPHONE IS INTERMITTENT WITH BLUETOOTH AND SPEAKERS ARE UNACCEPTABLE SPEAKERS SHOULD BE HIGHER QUALITY FOR THE PRICE OF THE VEHICLE"]
b = CategoryTextPipe.predict(rComments)
print(b)

['Center Console' 'Bluetooth Connectivity']


In [86]:
ITDataset=pd.read_csv("Data/allitverbatims.csv")
ITDataset.head()

Unnamed: 0,comment
0,I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON ...
1,CD PLAYER
2,A PILLAR IS HARD TO SEE AROUND AT AN INTERSEC...
3,A PILLARS OBSTRUCT SIDE VIEW A LITTLE TOO MUCH
4,AUTOMATIC HIGH/LOW BEAMS


In [87]:
itComments=ITDataset.values[:,0]
print(itComments[0])
predList=CategoryTextPipe.predict(itComments)
dfPredList=pd.DataFrame(predList)
dfPredList.head()

I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON THE CENTER CONSOLE 


Unnamed: 0,0
0,Center Console
1,General Audio
2,Front Visibility
3,Front Visibility
4,Headlights


In [88]:
print(itComments[522],predList[522])
result=pd.concat([ITDataset, dfPredList], axis=1)
result.head()

('ABILITY TO SWITCH BETWEEN MUSIC AND PHONE FROM TOUCH BUTTON ON STEERING WHEEL WITHOUT REACHING OVER TO RADIO ABILITY TO CONTROL IPOD FROM STEERING WHEEL ', 'Steering Wheel Controls')


Unnamed: 0,comment,0
0,I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON ...,Center Console
1,CD PLAYER,General Audio
2,A PILLAR IS HARD TO SEE AROUND AT AN INTERSEC...,Front Visibility
3,A PILLARS OBSTRUCT SIDE VIEW A LITTLE TOO MUCH,Front Visibility
4,AUTOMATIC HIGH/LOW BEAMS,Headlights


In [93]:
result.to_csv("Data/allitcomments_categorized.csv")

In [97]:
Xcat=dataset.values[:,1]
fulldataCategory=pd.DataFrame(CategoryTextPipe.predict(X))
fulldataSemantic=pd.DataFrame(SemanticTextPipe.predict(X))
full=pd.concat([dataset,fulldataCategory,fulldataSemantic],axis=1)
full.to_csv("Data/TrainingData_Categorized")

In [None]:
print(len(comments))