In [1]:
import time
start = time.time()

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
import sqlite3

import nltk
#GitHub needs nltk.download()
import random

#NLP
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# pd.set_option('max_colwidth', 2000)
# pd.options.display.max_rows = 500
# pd.options.display.max_columns = 500

In [2]:
def query_table_to_df(dbname_string, query, index_col='index'):
    '''Query a sqlite database and return the result as a DataFrame.

    Parameters
    ----------
    dbname_string : str
        Database path handed to ``sqlite3.connect``.
    query : str
        SQL statement handed to ``pd.read_sql``.
    index_col : str or list of str, optional
        Result column(s) used as the DataFrame index.  Defaults to
        ``'index'``.

    Returns
    -------
    pandas.DataFrame
        The query result, indexed by ``index_col``.
    '''
    conn = sqlite3.connect(dbname_string)
    try:
        # The connection's context manager only commits (or rolls back on
        # error); it does NOT close the connection, hence the explicit
        # close() in the finally clause.
        with conn:
            return pd.read_sql(sql=query, con=conn, index_col=index_col)
    finally:
        conn.close()

In [3]:
# Load data.
# `vehicle` is read from a joblib file because it contains sqlite-incompatible
# data; the train/test tables are read from the sqlite database.
vehicle = joblib.load('../vehicle.joblib')
train = query_table_to_df('auto.sqlite', 'SELECT * from trainClean')
test = query_table_to_df('auto.sqlite', 'SELECT * from testClean')

# Clear the index label on both frames.
train.index.name = test.index.name = None

In [4]:
# Report the number of rows in `vehicle`, then preview the first one.
print('vehicle row count:', vehicle.shape[0])
vehicle.iloc[:1]

vehicle row count: 3458


Unnamed: 0,URL_Vehicle,Title,Location,Year,Year_in_Title,Odometer,RawMake,Make,Model,Trim,...,ImageDictionary,VehicleID,VIN,Condition,TrimAlternate,DownFlag,LN_Price,Model_Trim,Model_Seller,Trim_Seller
1,https://austin.craigslist.org/cto/d/red-rock-2...,2006 Ford Focus SE,austin,2006,,181000.0,2006 ford focus se,ford,focus,se,...,{'0': 'https://images.craigslist.org/00n0n_dXT...,7022331083,None2,great,se,,8.006701,focus_se,focus_owner,se_owner


In [5]:
# English stop words from the NLTK corpus.
# On a fresh environment the corpus is not installed and the lookup raises
# LookupError, so fetch it once on demand and retry.
try:
    stopword_list = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopword_list = nltk.corpus.stopwords.words('english')

In [6]:
# Build a unigram bag-of-words matrix from the Body text.
# ngram_range (1,1) = unigrams, (2,2) = bigrams.  See Feature Engineering p.46.
cv = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=stopword_list,
    strip_accents='unicode',
    lowercase=True,
)
wc_vector = cv.fit_transform(vehicle['Body'])
wc_vector.shape

(3458, 14013)

In [7]:
# View the word-count matrix as a DataFrame: one row per vehicle row, one
# column per vocabulary token.
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# .toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
wc_vector_dense = pd.DataFrame(
    wc_vector.toarray(), index=vehicle.index, columns=feature_names
)
wc_vector_dense.head()

Unnamed: 0,00,000,0000,000688,000696,000697,000702,000703,000706,000lb,...,zopxhhkqe8sr,zp1629a,zp18843a,zp18944b,zr,zr1,zr17,zr19,zx2,zx4
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

# Tokenize every row of the Body series (not just a sample) and preview the
# first three token lists.
tokenized = vehicle['Body'].map(word_tokenize)
tokenized.head(3)

1     [,, 2006, Ford, Focus, in, great, mechanical, ...
5     [,, ,, ,, address, :, ,, phone, :, ☎, ,, text,...
13    [,, Everything, works, ,, Turbo, diesel, ,, Ne...
Name: Body, dtype: object

In [9]:
# POS-tag each tokenized row, then flatten the tagged rows into a table with
# one (token, POS tag) row per token tagged 'JJ'.
seriesPOS = tokenized.apply(nltk.pos_tag)
adjectives = pd.DataFrame(
    [tup for row in seriesPOS for tup in row if tup[1] == 'JJ'],
    columns=['tok', 'tok_pos'],
)

# Count occurrences per (POS tag, token) pair.
# The dict-renaming form .agg({'tok': 'count'}) is deprecated and rejected by
# newer pandas, so count directly and name the resulting column instead.
tok_piv = adjectives.groupby(['tok_pos', 'tok'])['tok'].count().to_frame('tok')
tok_piv.nlargest(5, 'tok')

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,tok
tok_pos,tok,Unnamed: 2_level_1
JJ,used,1743
JJ,new,1459
JJ,other,1235
JJ,Subject,1018
JJ,available,972


In [10]:
# Keep a single row per distinct token (the first occurrence), then check the
# resulting size.
adjectives = adjectives.drop_duplicates(subset=['tok'])
adjectives.shape

(2792, 2)

In [11]:
# Transpose the word-count frame so each row is a vocabulary token and each
# column a vehicle row, exposing the token as a regular column named "token".
wcv = wc_vector_dense.transpose().reset_index().rename(columns={"index": "token"})
wcv.head()

Unnamed: 0,token,1,5,13,17,20,21,22,26,28,...,10502,10508,10509,10514,10523,10524,10527,10532,10535,10537
0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,688,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,696,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Join adjectives and wcv on token, then add a 'Total' column holding each
# token's count summed across all vehicle rows.
# NOTE(review): `tok` keeps its original casing while the CountVectorizer
# vocabulary is lowercased, so adjectives that only occur capitalized will not
# match in this inner join -- confirm that is intended.
merged = adjectives.merge(wcv, how='inner', left_on=['tok'], right_on=['token'])
merged = merged.drop(['tok'], axis='columns')

# 'tok_pos' and 'token' hold strings; restrict the row sum to the numeric count
# columns explicitly instead of relying on pandas silently skipping them.
merged['Total'] = merged.sum(axis=1, numeric_only=True)

In [19]:
# Preview the first two rows of `vehicle`.
vehicle.iloc[:2]

Unnamed: 0,URL_Vehicle,Title,Location,Year,Year_in_Title,Odometer,RawMake,Make,Model,Trim,...,ImageDictionary,VehicleID,VIN,Condition,TrimAlternate,DownFlag,LN_Price,Model_Trim,Model_Seller,Trim_Seller
1,https://austin.craigslist.org/cto/d/red-rock-2...,2006 Ford Focus SE,austin,2006,,181000.0,2006 ford focus se,ford,focus,se,...,{'0': 'https://images.craigslist.org/00n0n_dXT...,7022331083,None2,great,se,,8.006701,focus_se,focus_owner,se_owner
5,https://austin.craigslist.org/ctd/d/lockhart-2...,2015 Ford Super Duty F-250 SRW 2WD Crew Cab 17...,austin,2015,,102467.0,2015 ford super duty f-250 srw,ford,f-250,super,...,{'0': 'https://images.craigslist.org/00808_7vQ...,7022290197,1FT7W2A69FEC27615,,super duty,,9.798071,f-250_super,f-250_dealer,super_dealer


In [33]:
# Ten adjective tokens with the largest overall counts.  The numeric column
# names are the original vehicle index values.
merged.sort_values('Total', ascending=False).iloc[:10]

Unnamed: 0,tok_pos,token,1,5,13,17,20,21,22,26,...,10508,10509,10514,10523,10524,10527,10532,10535,10537,Total
694,JJ,ford,1,3,0,3,10,1,1,1,...,9,2,1,1,1,1,9,0,1,7372
162,JJ,call,1,0,0,1,0,1,1,1,...,0,2,0,2,1,0,0,0,1,3642
18,JJ,new,0,0,2,1,1,0,1,0,...,1,0,0,0,0,0,1,0,0,3451
65,JJ,used,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2575
193,JJ,f150,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,2511
336,JJ,diesel,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2496
619,JJ,cab,0,0,0,2,0,0,0,0,...,0,0,0,0,1,0,0,0,1,2418
54,JJ,super,0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,2316
183,JJ,front,0,2,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2111
145,JJ,back,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,1,1968


In [32]:
# Raise the column display width so the full Body text of row 5 is shown.
# The option is addressed by its full name: abbreviated names are resolved by
# pattern matching and can break if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 9000)
vehicle.loc[[5], ['Body', 'Model']]

Unnamed: 0,Body,Model
5,"\n ,\n,\n,\naddress: ,phone: ☎ ,text: , ,\nlink: , ,\ncontact: ,\n\n,\n,\n,after hours please text 512-761-7442,\nXL trim, Oxford White exterior and Steel interior. CARFAX 1-Owner. Flex Fuel, Tow Hitch, Edmunds.com's review says Compared to the competition, the 2015 Ford F-250 Super Duty line is remarkably quiet at highway speeds.. ,KEY FEATURES INCLUDE,Flex Fuel, Trailer Hitch. Ford XL with Oxford White exterior and Steel interior features a 8 Cylinder Engine with 385 HP at 5500 RPM*. ,EXPERTS RAVE,Edmunds.com explains Compared to the competition, the 2015 Ford F-250 Super Duty line is remarkably quiet at highway speeds.. ,WHO WE ARE,Here at Chuck Nash we do things a little different. We have been a family owned and operated dealership for over 38 years, and treat our customers as part of that family. We don't believe in pressure sales; but instead we hold the belief that if we treat our customers as we would our family, then we can create a relationship with a customer for life. We just don't want to put you into the car of your dreams and send you on your way, but we want to take care of you, and your vehicle, for years to ,Horsepower calculations based on trim engine configuration. Please confirm the accuracy of the included equipment by calling us prior to purchase. 
,4-Wheel ABS., 4-Wheel Disc Brakes., 6-Speed A/T., ,8 Cylinder Engine., A/C., Adjustable Steering Wheel., ,AM/FM Stereo., Brake Assist., Driver Air Bag., ,Driver Lumbar., Electronic Stability Control., Flex Fuel., ,Front Tow Hooks., Full Size Spare Tire., Intermittent Wipers., ,Pass-Through Rear Seat., Passenger Air Bag., Passenger Vanity Mirror., ,Power Steering., Rear Head Air Bag., Rear Wheel Drive., ,Side Head Air Bag., Split Bench Seat., Steel Wheels., ,Tires - Front All-Season., Tires - Rear All-Season., Trailer Hitch., ,Vehicle Anti-Theft System., Vinyl Seats., ,Tire Pressure Monitoring System., Variable Speed Intermittent Wipers., ,\n,\naddress: ,phone: ☎ ,text: , ,\nlink: , ,\ncontact: ,\n\n,\n",f-250


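### Further reading: A Practitioner's Guide to Natural Language Processing
[https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72](https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72)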