In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
dataset=pd.read_csv("Data/ITcomments.csv")
dataset.head()

Unnamed: 0,comment,Media Inputs,Navigation,APPs,Sound Quality,Audio System,Bluetooth,Voice Recognition,Audio Controls,Connectivity,...,AWD,Sun Visor,Tinted Windows,ABS,Sunroof,Amenities,IT,User Friendliness,ITS,EODO
0,I WOULD LIKE TO HAVE A STANDARD PLUG AVAILABLE...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,TO ACTUALLY HAVE A NAVIGATION SYSTEM BUILT IN,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,THE NISSAN CONNECT APPS DO NOT WORK WELL AND T...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,THE VEHICLE DOES NOT HAVE A NAVIGATION SYSTEM ...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,BEING ABLE TO TAKE THE BASS OUT OF THE DOOR SP...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
dataset.describe()

Unnamed: 0,Media Inputs,Navigation,APPs,Sound Quality,Audio System,Bluetooth,Voice Recognition,Audio Controls,Connectivity,Technology,...,AWD,Sun Visor,Tinted Windows,ABS,Sunroof,Amenities,IT,User Friendliness,ITS,EODO
count,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,...,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0,24191.0
mean,0.048985,0.197801,0.044603,0.057253,0.093961,0.179488,0.107685,0.076599,0.018189,0.019594,...,0.000661,0.001612,0.001033,0.000165,0.000372,0.23579,0.461701,0.078376,0.146542,0.260469
std,0.215841,0.398349,0.206436,0.23233,0.29178,0.383769,0.309988,0.265959,0.133636,0.138604,...,0.02571,0.04012,0.032131,0.012858,0.019285,0.4245,0.498541,0.268768,0.353656,0.438899
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
commentVector=dataset["comment"].values
print(commentVector[:10])
textPipe=Pipeline([('vect',CountVectorizer(stop_words='english')),
                   ('tfidf',TfidfTransformer()),
                   ('clf',MultinomialNB())])

[ 'I WOULD LIKE TO HAVE A STANDARD PLUG AVAILABLE THAT WE COULD USE TO PLUG IN MISCELLANEOUS IPAD IPHONE TABLETS ETC '
 'TO ACTUALLY HAVE A NAVIGATION SYSTEM BUILT IN '
 'THE NISSAN CONNECT APPS DO NOT WORK WELL AND THERE ARE VERY FEW OF THEM IT S ALMOST A POINTLESS FEATURE '
 'THE VEHICLE DOES NOT HAVE A NAVIGATION SYSTEM THAT ALMOST SEEMS LIKE IT WOULD BE A STANDARD ITEM '
 'BEING ABLE TO TAKE THE BASS OUT OF THE DOOR SPEAKERS SINCE HAVING BASS GOING THROUGH THEM IS NOT GOOD FOR THE SPEAKER AND ALSO ADDS DISTORTION TO OVERALL SOUND QUALITY MY TRUCK HAS A SUBWOOFER BUT I HAVE NO OPTION TO TURN THE SUBWOOFER UP/DOWN '
 'WHEN I BOUGHT THIS TRUCK ALL I WANTED WAS A BASIC RADIO AND MAYBE 1 CD PLAYER I DID NOT WANT NOR NEED ALL OF THE BLUE TOOTH STUFF JUST WANT TO HEAR A LITTLE MUSIC I REALLY HATED PAYING FOR ALL OF THAT EXTRA COMPUTER STUFF WHEN ALL I NEEDED '
 'I TRULY DON T KNOW I LISTEN TO THE RADIO BUT DON T USE A CELL PHONE OR ANY ELECTRONIC GIZMOS WHILE IN THE CAR '
 'THE SOUND QUAL

In [26]:
def balanced_subsample(x,y,subsample_size=1.0):

    class_xs = []
    min_elems = None

    for yi in np.unique(y):
        elems = x[(y == yi)]
        class_xs.append((yi, elems))
        if min_elems == None or elems.shape[0] < min_elems:
            min_elems = elems.shape[0]

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems*subsample_size)

    xs = []
    ys = []
    
    xtest=[]
    ytest=[]

    for ci,this_xs in class_xs:
        if len(this_xs) > use_elems:
            np.random.shuffle(this_xs)

        x_ = this_xs[:use_elems]
        y_ = np.empty(use_elems)
        y_.fill(ci)
        
        xs.append(x_)
        ys.append(y_)
        
        x_ = this_xs[use_elems:]
        y_ = np.empty(len(x_))
        y_.fill(ci)
        
        xtest.append(x_)
        ytest.append(y_)

    xs = np.concatenate(xs)
    ys = np.concatenate(ys)
    
    xtest = np.concatenate(xtest)
    ytest = np.concatenate(ytest)
    
    return xs,xtest,ys,ytest

In [27]:
#train Amenities classifier
y=dataset["Amenities"].values
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.4)
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.872535534159


In [28]:
#test with SVM classifier
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.2)
textPipe2=Pipeline([('vect',CountVectorizer(stop_words='english')),
                   ('tfidf',TfidfTransformer()),
                   ('clf',SGDClassifier(loss='hinge',penalty='l2',alpha=1e-3,n_iter=5))])
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.88092738807


In [29]:
#train IT classifier
y=dataset["IT"].values
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.2)
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.921368821293


In [30]:
#train User Friendliness classifier
y=dataset["User Friendliness"].values
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.5)
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.805382372729


In [31]:
#train ITS classifier
y=dataset["ITS"].values
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.6)
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.796809951347


In [32]:
#train EODO classifier
y=dataset["EODO"].values
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.5)
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.909563467665


In [33]:
rComments=["USB PORT STANDARD ON ALL MODELS ESPECIALLY ONE PLACED IN THE CONSOLE IMPROVING THE RADIO DISPLAY AND PHONE SOUND QUALITY","CONNECTIVITY WITH IPHONE IS INTERMITTENT WITH BLUETOOTH AND SPEAKERS ARE UNACCEPTABLE SPEAKERS SHOULD BE HIGHER QUALITY FOR THE PRICE OF THE VEHICLE"]
b = textPipe.predict(rComments)
print(b)

[ 0.  0.]


In [34]:
#train Navigation classifier
y=dataset["Navigation"].values
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.50)
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.830267429278


In [35]:
#train Navigation classifier
y=dataset["Sunroof"].values
commentTrain,commentTest,ytrain,ytest=balanced_subsample(commentVector,y,.50)
textPipe = textPipe.fit(commentTrain,ytrain)
print(textPipe.score(commentTest,ytest))

0.800603729893


In [37]:
#ynew=dataset.drop(["Consumer Reports"],1)
col=dataset.columns.values
catList=[]
data=dataset.values
for i in range(24191):
    l=[]
    for j in range(len(col)):
        if data[i,j]==1:
            l.append(col[j])
    catList.append(l)
print(catList[0:10])

[['Media Inputs', 'Amenities', 'IT'], ['Navigation', 'IT'], ['APPs', 'IT'], ['Navigation', 'IT'], ['Sound Quality', 'Amenities'], ['Audio System', 'Amenities'], ['Bluetooth', 'IT'], ['Sound Quality', 'Amenities'], ['Voice Recognition', 'IT'], ['APPs', 'IT']]


In [38]:
m=MultiLabelBinarizer()
d=m.fit_transform(catList)
print(d[:5])

[[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0 0 0]]


In [39]:

Xtrain, Xtest, ytrain, ytest=train_test_split(commentVector,d,test_size=.66)
textPipeMC=Pipeline([('vect',CountVectorizer()),
                   ('tfidf',TfidfTransformer(use_idf=True)),
                   ('clf',OneVsRestClassifier(SVC(kernel='linear',decision_function_shape=None)))])

textPipeMC.fit(Xtrain,ytrain)
print(textPipeMC.score(Xtest,ytest))

0.786622408718


In [41]:
allData=pd.read_csv("Data/allitverbatims.csv")
allData.head()

Unnamed: 0,comment
0,I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON ...
1,CD PLAYER
2,A PILLAR IS HARD TO SEE AROUND AT AN INTERSEC...
3,A PILLARS OBSTRUCT SIDE VIEW A LITTLE TOO MUCH
4,AUTOMATIC HIGH/LOW BEAMS


In [42]:
allData=allData.dropna()
comments=allData["comment"].values
print(comments[:10])
preds=textPipeMC.predict(comments)

['I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON THE CENTER CONSOLE '
 '  CD PLAYER ' ' A PILLAR IS HARD TO SEE AROUND AT AN INTERSECTION'
 ' A PILLARS OBSTRUCT SIDE VIEW A LITTLE TOO MUCH'
 ' AUTOMATIC HIGH/LOW BEAMS'
 ' BETTER VOICE CONTROL ONLINE NAVIGATION MAP UPDATES HAVING 4G INSTEAD OF 3G '
 ' CAR NEXT TO YOU LIGHT ON SIDE MIRROR HELPS WITH SIDE BLIND SPOTS'
 ' CONNECTED APPLICATIONS RUNNING ON THE PHONE ARE NOT PRACTICAL IT SHOULD NOT BE REQUIRED TO KEEP THE APPLICATION IN THE FOREGROUND ON THE PHONE WHY CAN T THE CAR READ RECEIVED TEXT OR ALLOW ME TO DICTATE NEW TEXTS? THIS SEEMS LIKE A MUST HAVE '
 ' FUN FACTOR'
 ' GOOD I VE BEEN TOLD BY FAMILY AND FRIENDS THE BACK SEATS GIVE THEM GOOD VISIBILITY OF THE OUTSIDE ']


In [None]:
print(preds[:10])

In [None]:
col=dataset.drop(["comment"],1).columns.values
predCatList=[]
for i in range(len(comments)):
    l=[]
    for j in range(43):
        if preds[i,j]==1:
            l.append(m.classes_[j])
    predCatList.append(l)
print(predCatList[0:10])

In [None]:
print(col)

In [None]:
print(comments[5223],predCatList[5223])

In [None]:
with open("outputcategories.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(predCatList)

with open("outputcomments.csv", "wb") as f:
    writer = csv.writer(f, dialect='excel', delimiter='\r')
    writer.writerow(np.transpose(comments).tolist())

In [None]:
print(len(comments))