In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import re
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

## Overview of Data

In [21]:
df = pd.read_csv('../Data/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


## Split into train and test groups

In [23]:
X = df[['keyword', 'location', 'text']]
y = df['target']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=18)

In [25]:
X_train

Unnamed: 0,keyword,location,text
355,army,,Vote for #Directioners vs #Queens in the 5th r...
1570,cliff%20fall,,#FunnyNews #Business Watch the moment a cliff ...
7198,weapon,,Back to back like I'm on the cover of lethal w...
2614,destruction,,Crackdown 3 Destruction Restricted to Multipla...
6977,twister,,It's alil twister at Tha end to! I was like oh...
...,...,...,...
1726,collided,"Johannesburg, South Africa",2 pple have been confirmed dead and over 20 re...
2885,drought,"Los Angeles, CA",'It's an eerie way of revealing both our histo...
1144,bombing,,Japan Marks 70th Anniversary of Hiroshima Atom...
4371,hijacker,,Medieval airplane hijacker testa: earnings the...


### Remove Stopwords & Tokenize Text

In [26]:
#Create List of stopwords & punctuation
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

In [27]:
def process_tweet(tweet_text):
    
    #Remove url links from text
    tweet_text = re.sub(r"http\S+", "", tweet_text)
    
    #Tokenize text using NLTK function
    tokens = nltk.word_tokenize(tweet_text)
    
    #Make all words lowercase and remove words in stopwords_list
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    
    return stopwords_removed        

In [28]:
X_train_processed = list(map(process_tweet, X_train['text']))

### EDA - Frequency Distribution

Find total unique words in the dataset.

In [29]:
#Use a set so that no duplicate words are counted
total_vocab = set()
for text in X_train_processed:
    total_vocab.update(text)
len(total_vocab)

15132

In [30]:
articles_concat = []
for article in X_train_processed:
    articles_concat += article

articles_freqdist = FreqDist(articles_concat)
articles_freqdist.most_common(50)

[("'s", 605),
 ("n't", 342),
 ('like', 263),
 ('amp', 259),
 ("'m", 184),
 ('fire', 181),
 ('get', 166),
 ('via', 163),
 ('new', 160),
 ('news', 153),
 ('people', 144),
 ('one', 140),
 ('video', 130),
 ('disaster', 119),
 ('2', 118),
 ('emergency', 115),
 ('would', 106),
 ('police', 103),
 ("'re", 101),
 ('still', 95),
 ('man', 93),
 ('body', 92),
 ('back', 91),
 ('..', 91),
 ('going', 91),
 ('crash', 91),
 ('got', 90),
 ('storm', 89),
 ('day', 88),
 ('us', 88),
 ('california', 84),
 ('burning', 84),
 ('know', 81),
 ('suicide', 79),
 ('time', 79),
 ('two', 78),
 ('today', 78),
 ('buildings', 78),
 ('ca', 78),
 ('youtube', 78),
 ('see', 77),
 ('love', 76),
 ('first', 76),
 ('world', 75),
 ('killed', 75),
 ('families', 75),
 ('fires', 74),
 ('rt', 74),
 ('nuclear', 74),
 ('attack', 74)]

### Vectorize with TF-IDF

The TfidfVectorizer() function takes in whole blocks of text, not individual words.  Therefore, we join the lists stored in X_train_processed

In [31]:
text_processed = []
for text in X_train_processed:
    text_string = ' '.join(text)
    text_processed.append(text_string)

In [32]:
len(text_processed)

5709

Correct number of text entries.  Now join with dataframe.

In [33]:
text_processed[0]

'vote directioners vs queens 5th round billboard fanarmyfaceoff'

In [34]:
text_processed = pd.Series(text_processed)

In [35]:
X_train.reset_index(inplace=True)

In [36]:
X_train['text_processed'] = text_processed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
X_train

Unnamed: 0,index,keyword,location,text,text_processed
0,355,army,,Vote for #Directioners vs #Queens in the 5th r...,vote directioners vs queens 5th round billboar...
1,1570,cliff%20fall,,#FunnyNews #Business Watch the moment a cliff ...,funnynews business watch moment cliff collapse...
2,7198,weapon,,Back to back like I'm on the cover of lethal w...,back back like 'm cover lethal weapon
3,2614,destruction,,Crackdown 3 Destruction Restricted to Multipla...,crackdown 3 destruction restricted multiplayer...
4,6977,twister,,It's alil twister at Tha end to! I was like oh...,'s alil twister tha end like oh nah
...,...,...,...,...,...
5704,1726,collided,"Johannesburg, South Africa",2 pple have been confirmed dead and over 20 re...,2 pple confirmed dead 20 rescued many went mis...
5705,2885,drought,"Los Angeles, CA",'It's an eerie way of revealing both our histo...,'it 's eerie way revealing history possible fa...
5706,1144,bombing,,Japan Marks 70th Anniversary of Hiroshima Atom...,japan marks 70th anniversary hiroshima atomic ...
5707,4371,hijacker,,Medieval airplane hijacker testa: earnings the...,medieval airplane hijacker testa earnings dist...


In [38]:
vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True)

tf_idf_data_train = vectorizer.fit_transform(X_train['text_processed'])

# tf_idf_data_test = vectorizer.transform(newsgroups_test.data)

In [40]:
tf_idf_data_train.shape

(5709, 14146)

Our data contains 5709 articles with 14k unique words in the vocabulary.

## Random Forest Classifier

In [52]:
#Create pipeline
pipe_forest = Pipeline([('forest', RandomForestClassifier(random_state=70, n_jobs=-1, bootstrap=True))])

In [59]:
# Create the grid parameter
grid_forest = [{'forest__n_estimators': [100, 200, 300],
             'forest__max_depth': [1, 5, 15, 25, 50],
             'forest__min_samples_split': [2, 5, 10, 25, 50], 
             'forest__min_samples_leaf': [1, 3, 5, 10, 25], 
             'forest__criterion': ['gini', 'entropy'],
             'forest__max_features': ['auto', 'sqrt', 'log2'],
             'forest__max_samples': [None, .2, .5, .8]
             }]

# Create the grid, with "pipe" as the estimator
gridsearch_forest = RandomizedSearchCV(estimator=pipe_forest, 
                          param_distributions=grid_forest, 
#                           scoring=['r2', 'neg_root_mean_squared_error'], #Include RMSE in Results
#                           refit='r2', #Choose best model based on R^2
                          return_train_score=True, #Include training results in cv_results
                          cv=5, #Use 5 folds in CV process
                          n_iter=20, #Try 20 hyperparameter combinations
                          n_jobs=-1, #Use paralell computing
                          verbose=8) #Give updates on progress during fitting

In [60]:
gridsearch_forest.fit(tf_idf_data_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  98 out of 100 | elapsed:   32.9s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   32.9s finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('forest',
                                              RandomForestClassifier(n_jobs=-1,
                                                                     random_state=70))]),
                   n_iter=20, n_jobs=-1,
                   param_distributions=[{'forest__criterion': ['gini',
                                                               'entropy'],
                                         'forest__max_depth': [1, 5, 15, 25,
                                                               50],
                                         'forest__max_features': ['auto',
                                                                  'sqrt',
                                                                  'log2'],
                                         'forest__max_samples': [None, 0.2, 0.5,
                                                                 0.8],
                                         '

In [63]:
gridsearch_forest.best_params_

{'forest__n_estimators': 300,
 'forest__min_samples_split': 2,
 'forest__min_samples_leaf': 3,
 'forest__max_samples': None,
 'forest__max_features': 'auto',
 'forest__max_depth': 25,
 'forest__criterion': 'entropy'}

In [64]:
gridsearch_forest.best_score_

0.698194351284936

In [67]:
gridsearch_forest_df = pd.DataFrame.from_dict(gridsearch_forest.cv_results_)
best_models = gridsearch_forest_df.loc[gridsearch_forest_df['rank_test_score'] < 6]
best_models

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__n_estimators,param_forest__min_samples_split,param_forest__min_samples_leaf,param_forest__max_samples,param_forest__max_features,param_forest__max_depth,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
3,5.282258,0.411456,0.205505,0.001564,300,2,3,,auto,25,...,0.698194,0.007061,1,0.716006,0.706372,0.714473,0.718853,0.717163,0.714573,0.004344
7,3.437897,0.604151,0.206177,0.001984,300,25,10,0.8,auto,15,...,0.644243,0.007206,4,0.648128,0.650974,0.652945,0.654259,0.654335,0.652128,0.002341
8,3.27286,0.047754,0.207166,0.001388,300,10,3,0.8,sqrt,5,...,0.595375,0.003355,5,0.598642,0.600175,0.59711,0.597986,0.606392,0.600061,0.00332
11,2.944793,0.135482,0.106173,0.001623,200,5,1,0.8,sqrt,15,...,0.66649,0.005564,3,0.675936,0.672214,0.67725,0.680753,0.679947,0.67722,0.003054
14,1.937509,0.115371,0.104944,0.001682,100,50,3,,auto,25,...,0.69802,0.006033,2,0.722575,0.716444,0.714473,0.723451,0.721541,0.719697,0.003568
