In [5]:
import numpy as np
import pandas as pd
import re

In [22]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import  nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

print("Done with import")
#To read data out of the test File

Done with import


In [10]:
df = pd.read_csv('data/train_data.csv')
df['text'].head()

0    Keep your gloves, hats, coats and jackets toge...
1    The Home Dynamix Serendipity Ivory 5 ft. 2 in....
2    The Bosch 18-Volt lithium-ion line of Cordless...
3    Restore your Porter-Cable sander or polisher t...
4    The SPIKECUBE Surge Suppressor from Tripp Lite...
Name: text, dtype: object

In [9]:
#To sort the dataframe according to the id given
sorted_data = df.sort_values(by=['id'],ascending=True)
print(sorted_data.head())


                                                    text      id
16914  Classic architecture meets contemporary design...  100003
14666  The Grape Solar 265-Watt Polycrystalline PV So...  100004
30728  Achieving delicious results is almost effortle...  100006
974    The Quantum Adjustable 2-Light LED Black Emerg...  100007
19358  The Teks #10 x 1-1/2 in. Zinc-Plated Steel Was...  100008


In [11]:
sorted_data = sorted_data.reset_index(drop=True)
print(sorted_data.head(10))


                                                text      id
0  Classic architecture meets contemporary design...  100003
1  The Grape Solar 265-Watt Polycrystalline PV So...  100004
2  Achieving delicious results is almost effortle...  100006
3  The Quantum Adjustable 2-Light LED Black Emerg...  100007
4  The Teks #10 x 1-1/2 in. Zinc-Plated Steel Was...  100008
5  Recycler 22 in. Personal Pace Variable Speed S...  100011
6  The 96 in. wide Caramel Simple Weave Rollup Ba...  100012
7  Backyard X-Scapes, Inc. uses the finest bamboo...  100015
8  Make grilling great with this handsome and fun...  100016
9  The MD Building Products 36 in. x 36 in. x 1/5...  100017


In [12]:
#To read Test labels
test_labels = pd.read_csv("data/train_label.csv")
print(test_labels.head())


       id                  label
0  100003                  Shape
1  100004        Voltage (volts)
2  100004        Wattage (watts)
3  100006        Wattage (watts)
4  100007  ENERGY STAR Certified


In [13]:
#To remove duplicate enteries
test_labels = test_labels.drop_duplicates(subset =['id'],\
                            keep = 'first')

In [14]:
print(test_labels.head(10))


        id                  label
0   100003                  Shape
1   100004        Voltage (volts)
3   100006        Wattage (watts)
4   100007  ENERGY STAR Certified
5   100008                 Finish
8   100011               Features
10  100012               Features
13  100015       Package Quantity
14  100016                  Shape
15  100017         Indoor/Outdoor


In [15]:
#To get the total class labels
class_labels =  list(test_labels['label'].unique())
print(class_labels)


['Shape', 'Voltage (volts)', 'Wattage (watts)', 'ENERGY STAR Certified', 'Finish', 'Features', 'Package Quantity', 'Indoor/Outdoor', 'Included', 'Color', 'Assembly Required', 'Tools Product Type', 'Hardware Included', 'Commercial / Residential', 'Flooring Product Type']


In [17]:
labels = { k:v for v ,k in enumerate(class_labels) }
labels

{'Shape': 0,
 'Voltage (volts)': 1,
 'Wattage (watts)': 2,
 'ENERGY STAR Certified': 3,
 'Finish': 4,
 'Features': 5,
 'Package Quantity': 6,
 'Indoor/Outdoor': 7,
 'Included': 8,
 'Color': 9,
 'Assembly Required': 10,
 'Tools Product Type': 11,
 'Hardware Included': 12,
 'Commercial / Residential': 13,
 'Flooring Product Type': 14}

In [18]:
#To change the strings with the mappings
test_labels = test_labels.replace({'label': labels})
print(test_labels.head(10))


        id  label
0   100003      0
1   100004      1
3   100006      2
4   100007      3
5   100008      4
8   100011      5
10  100012      5
13  100015      6
14  100016      0
15  100017      7


In [19]:
#Need to join these class labels so that only one data frame remains
test_labels.index = sorted_data.index
cleaned_data = sorted_data.join(test_labels['label'])
print(cleaned_data.head(10))

                                                text      id  label
0  Classic architecture meets contemporary design...  100003      0
1  The Grape Solar 265-Watt Polycrystalline PV So...  100004      1
2  Achieving delicious results is almost effortle...  100006      2
3  The Quantum Adjustable 2-Light LED Black Emerg...  100007      3
4  The Teks #10 x 1-1/2 in. Zinc-Plated Steel Was...  100008      4
5  Recycler 22 in. Personal Pace Variable Speed S...  100011      5
6  The 96 in. wide Caramel Simple Weave Rollup Ba...  100012      5
7  Backyard X-Scapes, Inc. uses the finest bamboo...  100015      6
8  Make grilling great with this handsome and fun...  100016      0
9  The MD Building Products 36 in. x 36 in. x 1/5...  100017      7


# preprocessing



In [25]:
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yashwantjangid/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
cleaned_data['cleaned'] = cleaned_data['text'].apply( lambda x: \
                           " ".join([stemmer.stem(i) for i in \
                           re.sub("[^a-zA-Z]"," ",x).split() \
                           if i not in words]).lower())
print(cleaned_data['cleaned'].head(10))


0    classic architectur meet contemporari design e...
1    the grape solar watt polycrystallin pv solar p...
2    achiev delici result almost effortless whirlpo...
3    the quantum adjust light led black emerg light...
4    the tek x zinc plate steel washer head hex sel...
5    recycl person pace variabl speed self propel g...
6    the wide caramel simpl weav rollup bamboo shad...
7    backyard x scape inc use finest bamboo cane an...
8    make grill great handsom function grill gazebo...
9    the md build product x x aluminum cloverleaf n...
Name: cleaned, dtype: object


In [28]:
#To split into the train and test samples to get the accuracy of the model
# We split the data set into the 20% test size
X_train, X_test, y_train, y_test = train_test_split(cleaned_data['cleaned'],\
                            cleaned_data['label'],test_size = 0.2)
print(X_train.head())


26450    the everbilt heavi duti tee hing ideal surfac ...
14045    the h x x return air grill use sidewal ceil op...
13781    chamberlain garag door open strong durabl made...
35172    this split lock washer provid circular bear su...
12955    a meter breaker panel sometim call one combo u...
Name: cleaned, dtype: object


In [29]:
print(y_train.head())

26450    4
14045    9
13781    2
35172    6
12955    7
Name: label, dtype: int64


In [30]:
# Now to build model from here
"""We will make use of the three things in the model fitting
first--- to make a vectorization or the feature set using the continuous bag
         of words by Tf idf vectorization, and too by bigram with ngram rnage(1,2)
second-- Need to extract top 10000 features from the gigantic matrix created by the
         tfidfvectorization
third--- by the selected features learn SVC through it, lasso regularization is used
         for better performance"""

pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range = (1,2), stop_words=\
                            "english", sublinear_tf = True)), \
                     ('chi', SelectKBest(chi2, k=10000)), \
                     ('clf', LinearSVC(C=1.0, penalty='l1', max_iter = 3000, \
                            dual=False))])

In [31]:
model = pipeline.fit(X_train,y_train)


In [32]:
# To extract top fetaures for each class
# its just for the visualization

# Need to search more about it
vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

In [33]:
feature_names = vectorizer.get_feature_names()
feature_names = [ feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)
print(feature_names[:10])

['ab watersens' 'abil brown' 'abil oven' 'abl stop' 'abl surfac' 'abov'
 'abov counter' 'abov grade' 'abov gradeinstal' 'abras']


In [34]:
#Top 10 features or the keywords
target_names = [ str(i) for i in range(15)]
print("top 10 keywords per class")
for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print(" %s : %s "%(label," ".join(feature_names[top10])))



top 10 keywords per class
 0 : base set shower rod hot tub wringer rake gazebo mildew weather rais garden cap base net free 
 1 : bussmann retard shell deliveri check cook odor exid lenmar wire connector continu feed central vacuum bar chain 
 2 : cook microwav smoke steam sensor cook power level defrost speedcook turntabl watt cook photo eye 
 3 : fixtur year product humid freezer pendant insul water heater sconc tankless wash cycl 
 4 : mantel brass base rear mount schlage end panel door mould hardwar design water heater fara 
 5 : fit window window treatment weather strip low glass pelletv glass block truport tafco creosot halloween 
 6 : conduit red dot varianc buy guid superstrut halex window box superior grit raco carlon 
 7 : bait amp space extens cord dap antenna potabl kill readi hang recycl fee trash 
 8 : apron ship continent locker ft lb storag box installationarchitectur design zephyr peg construct wall plate help click 
 9 : return air breathabl heater behr coordin hardwo

In [35]:
print("accuracy score: "+str(model.score(X_test,y_test)))
print(model.predict(["These machine screw nuts are designed to be used with smaller machine screws (under 1/4 in.) and have a hex drive. Used for fastening to a screw when mechanically joining materials together. Must be used with like materials/sized screws. Available in various materials and finishes to suit your application.California residents: see&nbsp"]))



accuracy score: 0.8664902573971615
[4]


In [36]:
# Now to load test data set and get the final readings
Test_Set = pd.read_csv("data/test_data.csv")
Test_labels = model.predict(Test_Set['text'])
print(Test_labels[:10])


[ 4 11  6  9  9 11 12  7  7 13]


In [37]:
mapping = { k: v for k,v in enumerate(class_labels) }
print(mapping)

{0: 'Shape', 1: 'Voltage (volts)', 2: 'Wattage (watts)', 3: 'ENERGY STAR Certified', 4: 'Finish', 5: 'Features', 6: 'Package Quantity', 7: 'Indoor/Outdoor', 8: 'Included', 9: 'Color', 10: 'Assembly Required', 11: 'Tools Product Type', 12: 'Hardware Included', 13: 'Commercial / Residential', 14: 'Flooring Product Type'}


In [38]:
submission_data = pd.read_csv("data/sample_submission.csv")
print(submission_data.head(10))

       id  Indoor/Outdoor  Commercial / Residential  ENERGY STAR Certified  \
0  114689             0.0                       0.0                    0.0   
1  183172             0.0                       0.0                    0.0   
2  217304             0.0                       0.0                    0.0   
3  184115             0.0                       0.0                    0.0   
4  103786             0.0                       0.0                    0.0   
5  202789             0.0                       0.0                    0.0   
6  208062             0.0                       0.0                    0.0   
7  177644             0.0                       0.0                    0.0   
8  117538             0.0                       0.0                    0.0   
9  198085             0.0                       0.0                    0.0   

   Hardware Included  Package Quantity  Flooring Product Type  Color  \
0                0.0               0.0                    0.0    0.0 

In [39]:
for i in range(len(Test_labels)):
    submission_data.at[i,mapping[Test_labels[i]]] = 1

print(submission_data.head(10))

submission_data.to_csv("FinalSubmission.csv", encoding='utf-8', index=False)

       id  Indoor/Outdoor  Commercial / Residential  ENERGY STAR Certified  \
0  114689             0.0                       0.0                    0.0   
1  183172             0.0                       0.0                    0.0   
2  217304             0.0                       0.0                    0.0   
3  184115             0.0                       0.0                    0.0   
4  103786             0.0                       0.0                    0.0   
5  202789             0.0                       0.0                    0.0   
6  208062             0.0                       0.0                    0.0   
7  177644             1.0                       0.0                    0.0   
8  117538             1.0                       0.0                    0.0   
9  198085             0.0                       1.0                    0.0   

   Hardware Included  Package Quantity  Flooring Product Type  Color  \
0                0.0               0.0                    0.0    0.0 