In [102]:
# imports
import pandas as pd
# Import Statements
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier

In [3]:
train = pd.read_csv('/Users/ianforrest/Desktop/coding/repos/ianforrest11/DS-Unit-4-Sprint-1-NLP/module3-document-classification/kaggle/train.csv')
test = pd.read_csv('/Users/ianforrest/Desktop/coding/repos/ianforrest11/DS-Unit-4-Sprint-1-NLP/module3-document-classification/kaggle/test.csv')

In [46]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [48]:
train['description'][0]

'A marriage of 13 and 18 year old bourbons. A mature yet very elegant whiskey, with a silky texture and so easy to embrace with a splash of water. Balanced notes of honeyed vanilla, soft caramel, a basket of complex orchard fruit, blackberry, papaya, and a dusting of cocoa and nutmeg; smooth finish. Sophisticated, stylish, with well-defined flavors. A classic!'

In [56]:
train['description'][2]

"This bottling celebrates master distiller Parker Beam's 50 years of service by including whiskey from each of the past five decades. This is a fabulous whiskey: seamless and incredibly complex, with an impeccable marriage of youth and maturity. It’s also very even-keeled throughout -- quite different than last year’s equally impressive PHC, a 27 year old, whose personality was more like an exhilarating old wooden rollercoaster ride (and also brandished more oak).\xa0Look for candied citrus, nectarine, blueberry, and sultana anchored by a nougat center, laced with honeyed vanilla and orange creamsicle. There’s a dusting of cocoa powder, brittle mint, and cinnamon, too. Tobacco leaves, polished leather, and teasing bourbon barrel char round out the palate, emerging more prominently towards a warming finish. A classic!"

In [16]:
X_train = train.drop(columns = ['category','id'])
y_train = train['category']

In [17]:
train['category'].value_counts()

1    1637
2     449
3     300
4     200
Name: category, dtype: int64

In [18]:
test.head()

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


In [165]:
test.shape

(288, 2)

In [25]:
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [26]:
pipe.fit(train['description'], train['category'])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [79]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': ( 100, 500, 1000),
    'rfc__n_estimators': (20, 125, 450),
}

clf = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1)

In [80]:
clf.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (100, 500, 1000), 'rfc__n_estimators': (20, 125, 450)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [81]:
# init 0.8986852281515855
# decrease n_estimators 0.8967517401392111
# added max depth 0.8863109048723898
# decreased max depth 0.8859242072699149
# increased max depth, vect max df 0.8878576952822892
# increased max depth 0.8936581593194122
# increase n_estimators 0.8940448569218871
# increase n_estimators, max depth 0.8932714617169374
# remove max depth parameter, leave increased n_estimators 0.8952049497293116
# reduce max df, increase n estimators 0.8982985305491106
# reduce number of estimators 0.897138437741686
# reduce n_estimators 0.8975251353441609
# increase n_estimators 0.8967517401392111
# reduce n_estimators 0.8967517401392111
# increse n_estimators 0.897138437741686
clf.best_score_

0.897138437741686

In [163]:
# svd and sgdc
svd = TruncatedSVD(algorithm='randomized',
                  n_iter=15, random_state=42)
sgdc = SGDClassifier(random_state=42)

pipe2 = Pipeline([('vect', vect), 
                  ('svd', svd),
                  ('sgdc', sgdc)])

parameters2 = {
    'svd__n_components': (100, 300, 2000),
    'sgdc__max_iter': (500, 1000, 3000),
}
clf2 = GridSearchCV(pipe2, parameters2, cv=5, n_jobs=-1)

In [164]:
clf2.fit(train['description'], train['category'])



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'svd__n_components': (100, 300, 2000), 'sgdc__max_iter': (500, 1000, 3000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [166]:
# 0.9331013147718484
# 0.9338747099767981
clf2.best_score_

0.9334880123743233

In [159]:
# Predictions on test sample
pred = clf2.predict(test['description'])

In [167]:
test['category'] = pred

In [168]:
test.head()

Unnamed: 0,id,description,category
0,955,"Think carnival aromas—the good ones, anyway—me...",2
1,3532,"A blend of three bourbons, between 6 and 12 ye...",2
2,1390,"The nose is focused on cereal, hints of fresh ...",4
3,1024,Swiss-based Chapter 7 released this 19 year ol...,1
4,1902,Valkyrie replaces the current Dark Origins exp...,1


In [169]:
combo = pd.concat([train, test])

In [170]:
clf2.fit(combo['description'], combo['category'])



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'svd__n_components': (100, 300, 2000), 'sgdc__max_iter': (500, 1000, 3000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [171]:
clf2.best_score_

0.9418928322894919

In [172]:
# Predictions on test sample
pred = clf2.predict(test['description'])

In [173]:
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [174]:
# Make Sure the Category is an Integer
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


In [175]:
# Save your Submission File
# Best to Use an Integer or Timestamp for different versions of your model
submission.to_csv('/Users/ianforrest/Desktop/coding/repos/ianforrest11/DS-Unit-4-Sprint-1-NLP/module3-document-classification/kaggle/submission4.csv', index=False)