In [1]:
import csv
import re
from collections import Counter

import gensim
import numpy as np
import pandas as pd
from gensim import corpora, models, matutils
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC, LinearSVC

#Build the LSI vectorizer from as many verbatims as possible

In [2]:
# Read every verbatim from the full comment file into a DataFrame
fullDataset = pd.read_csv("Data/allitverbatims.csv")
# Show the first few rows as the cell output
fullDataset.head()

Unnamed: 0,comment
0,I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON ...
1,CD PLAYER
2,A PILLAR IS HARD TO SEE AROUND AT AN INTERSEC...
3,A PILLARS OBSTRUCT SIDE VIEW A LITTLE TOO MUCH
4,AUTOMATIC HIGH/LOW BEAMS


In [3]:
# Pull the raw comment strings out of the frame and peek at the first three
fullDocuments = fullDataset["comment"].values
print(fullDocuments[:3])
# NLTK's English stopwords, held in a set for fast membership tests when tokenizing
stoplist = set(stopwords.words("english"))

['I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON THE CENTER CONSOLE '
 '  CD PLAYER ' ' A PILLAR IS HARD TO SEE AROUND AT AN INTERSECTION']


In [4]:
# Tokenize each verbatim: lowercase, split on whitespace, drop English stopwords
fullTexts = [[word for word in document.lower().split() if word not in stoplist] for document in fullDocuments]
# Flatten in a single linear pass (sum(list_of_lists, []) re-copies the accumulator on every step)
all_tokens = [word for text in fullTexts for word in text]
# Count every token once up front instead of calling all_tokens.count() per vocabulary word
token_counts = Counter(all_tokens)
# Tokens that occur exactly once in the whole corpus
tokens_once = set(word for word, count in token_counts.items() if count == 1)
# Remove the once-only tokens from every tokenized verbatim
fullTexts = [[word for word in text if word not in tokens_once] for text in fullTexts]

In [5]:
# Dictionary: assigns an integer id to every token that survived pruning
dictionary = corpora.Dictionary(fullTexts)
# Bag-of-words corpus: each tokenized verbatim becomes a sparse vector
# keyed by the dictionary's token ids
fullCorpus = [dictionary.doc2bow(text) for text in fullTexts]
# TF-IDF model fitted on the bag-of-words corpus, meant to "reward" tokens that are
# distinctive to a document - comment out to test non-tfIdf
tfidf = models.TfidfModel(fullCorpus)
# Indexing the model returns a lazily evaluated wrapper, so the whole corpus is not
# transformed (or loaded into memory) at once.
# NOTE(review): corpus_tfidf is not read anywhere else in the visible notebook - the LSI
# model is built from fullCorpus - confirm whether skipping TF-IDF is intentional.
corpus_tfidf = tfidf[fullCorpus]

In [6]:
# Fit a 600-topic LSI model on the bag-of-words corpus, using the dictionary for id -> word lookup
lsi = models.LsiModel(
    fullCorpus,
    id2word=dictionary,
    num_topics=600,
)

Next, create a new corpus from the known (labeled) verbatims and train an SVC on their LSI vectors

In [7]:
# Read the verbatims whose categories are already known (one 0/1 column per category)
dataset = pd.read_csv("IT comments.csv")
# Show the first few rows as the cell output
dataset.head()

Unnamed: 0,comment,Media Inputs,Navigation,APPs,Sound Quality,Audio System,Bluetooth,Voice Recognition,Audio Controls,Connectivity,...,AWD,Sun Visor,Tinted Windows,ABS,Sunroof,Amenities,IT,User Friendliness,ITS,EODO
0,I WOULD LIKE TO HAVE A STANDARD PLUG AVAILABLE...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,TO ACTUALLY HAVE A NAVIGATION SYSTEM BUILT IN,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,THE NISSAN CONNECT APPS DO NOT WORK WELL AND T...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,THE VEHICLE DOES NOT HAVE A NAVIGATION SYSTEM ...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,BEING ABLE TO TAKE THE BASS OUT OF THE DOOR SP...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# Tokenize the labeled verbatims the same way as the full corpus:
# lowercase, split on whitespace, drop English stopwords
trainingDocuments=dataset["comment"].values
stoplist = set(stopwords.words("english"))
trainingTexts = [[word for word in document.lower().split() if word not in stoplist] for document in trainingDocuments]
# Flatten in a single linear pass (sum(list_of_lists, []) re-copies the accumulator on every step)
all_tokens = [word for text in trainingTexts for word in text]
# Count every token once up front instead of calling all_tokens.count() per vocabulary word
token_counts = Counter(all_tokens)
# Tokens that occur exactly once across the labeled verbatims
tokens_once = set(word for word, count in token_counts.items() if count == 1)
# Remove the once-only tokens from every tokenized verbatim
trainingTexts = [[word for word in text if word not in tokens_once] for text in trainingTexts]

In [9]:
# Dictionary built from the labeled verbatims only.
# NOTE(review): trainingDictionary is not read anywhere else in the visible notebook.
trainingDictionary = corpora.Dictionary(trainingTexts)
# Bag-of-words vectors for the labeled verbatims. These are encoded with `dictionary`
# (the one the LSI model was given as id2word), not trainingDictionary, so the token ids
# match the ids the LSI model was fitted on.
trainingCorpus = [dictionary.doc2bow(text) for text in trainingTexts]

In [10]:
# Project the labeled bag-of-words vectors into LSI topic space
trainingCorpusLsi = lsi[trainingCorpus]
# corpus2csc returns a (features x documents) matrix; transpose to documents x features.
# num_terms is pinned to the model's topic count so the feature dimension does not depend
# on which topic ids happen to appear in this corpus and always matches the matrix the
# classifiers are later asked to predict on.
sparseTrainingLsi=matutils.corpus2csc(trainingCorpusLsi, num_terms=lsi.num_topics).transpose()

In [11]:
# Trial run on a single label ("Amenities") before looping over every category
y=dataset["Amenities"].values
# 50/50 stratified split; random_state is fixed so the split, and therefore the reported
# metrics, are reproducible from run to run
Xtrain, Xtest, ytrain, ytest=train_test_split(sparseTrainingLsi,y,test_size=.5,stratify=y,random_state=0)
classifier=LinearSVC(C=8.5,random_state=0)
classifier.fit(Xtrain,ytrain)
# Predict the held-out half once and reuse it for both accuracy and the per-class report
ypred=classifier.predict(Xtest)
print("Accuracy: {}".format(np.mean(ypred==ytest)))
# Tuple of per-class (precision, recall, f-score, support) arrays
print(precision_recall_fscore_support(ytest, ypred))

Accuracy: 0.925181878307
(array([ 0.94229341,  0.86474335]), array([ 0.96094764,  0.80925666]), array([ 0.95152911,  0.83608042]), array([9244, 2852]))


In [12]:
# Train one binary LinearSVC per label column and keep (column name, fitted classifier) pairs
classifiers=[]
cols=dataset.columns.values
# cols[1:] skips the first column.
# NOTE(review): presumably the first column is the comment text - confirm.
for c in cols[1:]:
    y=dataset[c].values
    # 50/50 stratified split; random_state is fixed so every label's metrics are reproducible
    Xtrain, Xtest, ytrain, ytest=train_test_split(sparseTrainingLsi,y,test_size=.5,stratify=y,random_state=0)
    classifier=LinearSVC(C=8.5,random_state=0)
    classifier.fit(Xtrain,ytrain)
    # Predict the held-out half once and reuse it for both accuracy and the per-class report
    ypred=classifier.predict(Xtest)
    print("{} Accuracy: {}".format(c,np.mean(ypred==ytest)))
    print(precision_recall_fscore_support(ytest, ypred))
    classifiers.append((c,classifier))

Media Inputs Accuracy: 0.976107804233
(array([ 0.99037957,  0.72960725]), array([ 0.98443884,  0.81450253]), array([ 0.98740027,  0.76972112]), array([11503,   593]))
Navigation Accuracy: 0.984292328042
(array([ 0.99081622,  0.95800416]), array([ 0.98959085,  0.96280819]), array([ 0.99020316,  0.96040017]), array([9703, 2393]))
APPs Accuracy: 0.989417989418
(array([ 0.99326657,  0.90234375]), array([ 0.99567324,  0.85555556]), array([ 0.99446845,  0.878327  ]), array([11556,   540]))
Sound Quality Accuracy: 0.972470238095
(array([ 0.98510079,  0.76239067]), array([ 0.98570552,  0.75468975]), array([ 0.98540306,  0.75852067]), array([11403,   693]))
Audio System Accuracy: 0.960648148148
(array([ 0.9770638 ,  0.79801623]), array([ 0.97956018,  0.77836412]), array([ 0.9783104 ,  0.78806768]), array([10959,  1137]))
Bluetooth Accuracy: 0.969246031746
(array([ 0.97846339,  0.92569806]), array([ 0.98418136,  0.9009673 ]), array([ 0.98131404,  0.91316527]), array([9925, 2171]))
Voice Recognit

  'precision', 'predicted', average, warn_for)


Distractions Accuracy: 0.997767857143
(array([ 0.99925472,  0.1       ]), array([ 0.99851055,  0.18181818]), array([ 0.9988825 ,  0.12903226]), array([12085,    11]))
Air Bags Accuracy: 0.996114417989
(array([ 0.99801028,  0.32352941]), array([ 0.99809303,  0.31428571]), array([ 0.99805165,  0.31884058]), array([12061,    35]))
Steering Wheel Accuracy: 0.999586640212
(array([ 0.99983459,  0.4       ]), array([ 0.9997519,  0.5      ]), array([ 0.99979324,  0.44444444]), array([12092,     4]))
Ergonomics Accuracy: 0.999586640212
(array([ 0.99966928,  0.        ]), array([ 0.9999173,  0.       ]), array([ 0.99979328,  0.        ]), array([12092,     4]))
Consumer Reports Accuracy: 0.999834656085
(array([ 0.99983466,  0.        ]), array([ 1.,  0.]), array([ 0.99991732,  0.        ]), array([12094,     2]))
Seat Accuracy: 0.999503968254
(array([ 0.99991725,  0.58333333]), array([ 0.99958637,  0.875     ]), array([ 0.99975178,  0.7       ]), array([12088,     8]))
Driving Position Accuracy:

Label the allitverbatims.csv file with the newly trained classifiers

In [13]:
# Project the full bag-of-words corpus into LSI topic space
corpusLsi = lsi[fullCorpus]
# corpus2csc returns a (features x documents) matrix; transpose to documents x features.
# num_terms is pinned to the model's topic count so the feature dimension does not depend
# on which topic ids happen to appear in this corpus and always matches the matrix the
# classifiers were trained on.
sparseLsi=matutils.corpus2csc(corpusLsi, num_terms=lsi.num_topics).transpose()

In [14]:
# Predict every label for every verbatim and assemble a (documents x columns) matrix.
# The leading all-zero column is a placeholder: the classifiers were trained on cols[1:],
# so with it in place column j of `labels` lines up with cols[j].
predictionColumns=[np.zeros((sparseLsi.shape[0]))]
for c, classifier in classifiers:
    ypred=classifier.predict(sparseLsi)
    predictionColumns.append(ypred)
# Stack once at the end; calling column_stack inside the loop re-copies the whole
# growing matrix on every iteration
labels=np.column_stack(predictionColumns)

In [15]:
# For each verbatim (row of `labels`), collect the names of the columns predicted as 1
catList = [
    [cols[j] for j in range(len(cols)) if labels[i, j] == 1]
    for i in range(labels.shape[0])
]

In [16]:
# Spot-check: show the first 20 verbatims next to their predicted categories
k = 0
while k < 20:
    print(fullDocuments[k],catList[k])
    k += 1

('I DON T REALLY LIKE THE BUTTON/KNOB LAYOUT ON THE CENTER CONSOLE ', [])
('  CD PLAYER ', ['Audio System', 'Amenities'])
(' A PILLAR IS HARD TO SEE AROUND AT AN INTERSECTION', ['Navigation', 'Amenities'])
(' A PILLARS OBSTRUCT SIDE VIEW A LITTLE TOO MUCH', ['EODO'])
(' AUTOMATIC HIGH/LOW BEAMS', [])
(' BETTER VOICE CONTROL ONLINE NAVIGATION MAP UPDATES HAVING 4G INSTEAD OF 3G ', ['Navigation', 'Voice Recognition', 'Telematics', 'IT'])
(' CAR NEXT TO YOU LIGHT ON SIDE MIRROR HELPS WITH SIDE BLIND SPOTS', ['Visibility', 'EODO'])
(' CONNECTED APPLICATIONS RUNNING ON THE PHONE ARE NOT PRACTICAL IT SHOULD NOT BE REQUIRED TO KEEP THE APPLICATION IN THE FOREGROUND ON THE PHONE WHY CAN T THE CAR READ RECEIVED TEXT OR ALLOW ME TO DICTATE NEW TEXTS? THIS SEEMS LIKE A MUST HAVE ', ['Bluetooth', 'IT'])
(' FUN FACTOR', [])
(' GOOD I VE BEEN TOLD BY FAMILY AND FRIENDS THE BACK SEATS GIVE THEM GOOD VISIBILITY OF THE OUTSIDE ', ['Visibility', 'EODO'])
(' JERKY SHIFTING', [])
(' NAVIGATION W/ TRAFFIC 

In [None]:
# Pair the first ten verbatims with their predicted category lists
a = zip(fullDocuments[:10], catList[:10])

# Turn each (verbatim, categories) tuple into a two-element list
r = [list(i) for i in a]

In [None]:
# Write one CSV row per verbatim: the comment text, then its predicted categories joined
# with ';'. The previous version used '\r' as the field delimiter and wrote the whole
# transposed result as a single row, which does not produce a usable CSV.
# The file is opened in "wb" mode, as the Python 2 csv module expects.
# NOTE(review): on Python 3 this would need open(..., "w", newline="") instead - confirm the target interpreter.
with open("results/outputtest.csv", "wb") as f:
    writer = csv.writer(f, dialect='excel')
    for comment, categories in r:
        writer.writerow([comment, ";".join(categories)])