In [1]:
import codecs
import csv

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC


In [8]:
# Load the labelled training data, discard every row that has a missing value,
# then give the three columns fixed names used by the rest of the notebook.
dataset = pd.read_csv("Data/Catagorization training data.csv").dropna(axis=0, how='any')
dataset.columns = ['Category', 'VerbatimComment', 'length']
# Preview the first rows as the cell output.
dataset.head()

Unnamed: 0,Category,VerbatimComment,length
0,Acceleration from a Stop,# 12 SUV DEMANDS A MORE POWERFUL ENGINE,39
1,Heating / Cooling Performance,# 16 HEATING IT JUST STAYS TOO COLD WHERE IS T...,54
2,General Dynamic Performance,"# 17 & # 18 AGILITY & FUN DRIVING PEP TURNING,...",73
3,General Powertrain,# 17 ENGINE & TRANSMISSION,26
4,Quietness During Acceleration,# 17 ENGINE TRANS. - COARSE SOUNDING ON HARD A...,102


In [9]:
# Pull the comment text out as a plain array and show the first ten entries.
commentVector = dataset.loc[:, "VerbatimComment"].values
print(commentVector[0:10])

['# 12 SUV DEMANDS A MORE POWERFUL ENGINE'
 '# 16 HEATING IT JUST STAYS TOO COLD WHERE IS THE HEAT.'
 '# 17 & # 18 AGILITY & FUN DRIVING PEP TURNING, SOLID PRODUCING CONFIDENCE'
 '# 17 ENGINE & TRANSMISSION'
 '# 17 ENGINE TRANS. - COARSE SOUNDING ON HARD ACCELERATION WOULD LIKE TO HAVE GDI FOR ADDED HORSE POWER'
 '# 17 ENGINE VERY DUICK AND CUT TRANSMISSION IS SMOOTH'
 '# 17 ENGINE/TRANSMISSION MPG / PERFORMANCE SMOOTHNESS / POWER / MANUAL'
 '# 17E LEATHER SEATS AND GREAT DASH BOARD'
 '# 18 - IT RIDES ROUGH EVEN ON THE HIGH WAY. I CAN HEAR THE ROUGHNESS.'
 '# 18 AMOUNT OF TRUNK SPACE - GOOD FOR A SMALL CAR']


In [34]:
# Test with SGD classifier: hinge loss + L2 penalty on TF-IDF weighted
# unigram/bigram counts (English stop words removed).
X = dataset.values[:, 1]  # second column of `dataset` (the comment text)
y = dataset.values[:, 0]  # first column of `dataset` (the category label)
# The split is seeded so the hold-out set -- and therefore the printed score --
# is the same on every Restart & Run All instead of drifting between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, random_state=42)
CategoryTextPipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-6, n_jobs=-1, random_state=42)),
])
# Pipeline.fit returns the pipeline itself, so `textPipe` is just a second
# name for the fitted CategoryTextPipe.
textPipe = CategoryTextPipe.fit(Xtrain, ytrain)
# Mean accuracy on the held-out split.
print(CategoryTextPipe.score(Xtest, ytest))

0.804260626285


In [47]:
# Read the first field of every row in Data/verbatims.csv into a one-column DataFrame.
# `csv` and `codecs` are imported in the notebook's top import cell.
verbatims = []
# errors='ignore' means any byte that is not valid ASCII is silently dropped
# from the text rather than raising a decode error.
with codecs.open('Data/verbatims.csv', 'r', encoding='ascii', errors='ignore') as f:
    for row in csv.reader(f):
        # csv.reader yields an empty list for a blank line; indexing it would
        # raise IndexError, so blank rows are skipped.
        if row:
            verbatims.append(row[0])

verbatimsdf = pd.DataFrame(verbatims)

In [63]:
# Show the verbatim stored at row label 6, column label 0 (single .loc lookup).
verbatimsdf.loc[6, 0]

"I don't feel anything can be done about how I feel. It is a beautiful car. I just don't like some things about it. Wish I had researched more and comparing to my previous vehicle, I miss the car I traded in. I probably would not purchase this vehicle again."

In [57]:
# Predict a category for every loaded verbatim; the frame's values are
# flattened to a 1-D array before being handed to the pipeline.
v = CategoryTextPipe.predict(verbatimsdf.values.ravel())

In [65]:
# Save the predicted categories as a single-column CSV without the row index.
pd.DataFrame(data=v).to_csv(path_or_buf='Data/verbatimcat.csv', index=False)

In [37]:
# Test with Multinomial Naive Bayes classifier on the same TF-IDF weighted
# unigram/bigram features (English stop words removed).
X = dataset.values[:, 1]  # second column of `dataset` (the comment text)
y = dataset.values[:, 0]  # first column of `dataset` (the category label)
# The split is seeded so the hold-out set -- and therefore the printed score --
# is the same on every Restart & Run All instead of drifting between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, random_state=42)
SemanticTextPipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=1e-5)),
])
# Pipeline.fit returns the pipeline itself, so `textPipe` is just a second
# name for the fitted SemanticTextPipe.
textPipe = SemanticTextPipe.fit(Xtrain, ytrain)
# Mean accuracy on the held-out split.
print(SemanticTextPipe.score(Xtest, ytest))

0.661779081134


In [13]:
# Hyper-parameter grid for the text pipeline. Each key is '<step>__<param>',
# addressing a parameter of the named pipeline step.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
)

In [14]:
# Grid-search the category pipeline over `parameters`, using all CPU cores.
gs_clf = GridSearchCV(estimator=CategoryTextPipe, param_grid=parameters, n_jobs=-1)
# Only the first 400 rows are used for the search. fit() returns the search
# object itself, so no re-assignment is needed; ';' keeps the cell output empty.
gs_clf.fit(X[:400], y[:400]);



In [16]:
# Report the best mean cross-validated score and the parameters that produced it.
print(gs_clf.best_score_)
# Full per-combination results table, kept for further inspection.
results = pd.DataFrame(gs_clf.cv_results_)
# Read the winning combination from the search object rather than from a
# hard-coded (row 17, column 7) position in `results`: the column layout of
# cv_results_ is not fixed, and row 17 is only the best row by coincidence.
print(gs_clf.best_params_)

0.5825
{'clf__alpha': 1e-06, 'vect__ngram_range': (1, 2), 'tfidf__use_idf': True}


In [17]:
# Spot-check the fitted category pipeline on two hand-written comments.
rComments = [
    "USB PORT STANDARD ON ALL MODELS ESPECIALLY ONE PLACED IN THE CONSOLE IMPROVING THE RADIO DISPLAY AND PHONE SOUND QUALITY",
    "CONNECTIVITY WITH IPHONE IS INTERMITTENT WITH BLUETOOTH AND SPEAKERS ARE UNACCEPTABLE SPEAKERS SHOULD BE HIGHER QUALITY FOR THE PRICE OF THE VEHICLE",
]
b = CategoryTextPipe.predict(rComments)
print(b)

['12V / USB / Aux Location / Quantity' 'Bluetooth Connectivity']


In [None]:
# Load the IT verbatims file and preview its first rows.
ITDataset = pd.read_csv(filepath_or_buffer="Data/allitverbatims.csv")
ITDataset.head()

In [None]:
# Take the first column of the IT dataset as the comment text, echo one
# example, then predict a category for every comment.
itComments = ITDataset.values[:, 0]
print(itComments[0])
predList = CategoryTextPipe.predict(itComments)
# Wrap the predictions in a DataFrame so they can be joined to the source rows.
dfPredList = pd.DataFrame(data=predList)
dfPredList.head()

In [None]:
# Print one comment together with its predicted category as a sanity check.
print(itComments[522], predList[522])
# Put the predictions next to the original columns (column-wise concat).
result = pd.concat(objs=[ITDataset, dfPredList], axis=1)
result.head()

In [None]:
# Save the categorized IT comments to CSV.
result.to_csv(path_or_buf="Data/allitcomments_categorized.csv")

In [None]:
# Label the whole training set with both fitted pipelines and save the
# predictions next to the original rows.
Xcat = dataset.values[:, 1]  # second column of `dataset` (the comment text)
# `dataset` is created with dropna() and is not re-indexed afterwards, so its
# index can contain gaps. The prediction frames reuse that exact index;
# otherwise concat(axis=1) would align a fresh 0..n-1 index against the gapped
# one, pairing predictions with the wrong rows and adding NaN-filled rows.
# The prediction columns are also named so the two outputs are distinguishable
# (both would otherwise be called 0).
fulldataCategory = pd.DataFrame(
    CategoryTextPipe.predict(Xcat),
    index=dataset.index,
    columns=['PredictedCategory_SGD'],
)
fulldataSemantic = pd.DataFrame(
    SemanticTextPipe.predict(Xcat),
    index=dataset.index,
    columns=['PredictedCategory_NB'],
)
full = pd.concat([dataset, fulldataCategory, fulldataSemantic], axis=1)
full.to_csv("Data/TrainingData_Categorized")

In [None]:
# NOTE(review): `comments` is not assigned anywhere in the visible notebook, so
# this raises NameError on a fresh kernel unless it is defined elsewhere --
# confirm which variable was intended.
print(len(comments))