In [1]:
import numpy as np
import pandas as pd

# 1. Cleaning Data

In [2]:
# Import data
df = pd.read_csv('combined_scraped_tweets.csv')

In [3]:
# Remove links
df['text_no_links'] = df['text'].str.replace('http\S+|www.\S+', '', case=False)

In [4]:
# Remove punctuation
df['text_punct'] = df['text_no_links'].str.replace('[^\w\s]','')

In [5]:
# Remove all other nonletters/nonspaces
df['letters_only'] = df['text_no_links'].str.replace('[^a-zA-Z\s]', '', case=False)
df['letters_only'] = df['letters_only'].str.replace('\\n', ' ', case=False)
df['letters_only'] = df['letters_only'].str.replace('\\xa0', ' ', case=False)

In [6]:
# Set all text to lowercase
df['clean_words'] = df['letters_only'].str.lower()

In [7]:
# Write to csv
df.to_csv('./cleaned_tweets.csv')

# 2. Preprocessing

In [46]:
# read in data
df = pd.read_csv('./cleaned_tweets.csv')

In [42]:
df.shape

(44472, 15)

## 2.1 Lemmatizing

In [47]:
# imports
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [48]:
# Instantiate tokenizer and lemmatizer
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

In [49]:
# Function that 
def lemma(text):
    tokens = tokenizer.tokenize(str(text))
    lems = [lemmatizer.lemmatize(i) for i in tokens]
    
    return(" ".join(lems))

In [50]:
# Lemmatize tweets
df['lems'] = df['clean_words'].apply(lambda x: lemma(x))

In [51]:
df['lems'][68]

'congrats to cb trucking inc on the purchase of this utility flatbed if youre looking for a flatbed give u a call today and let u help you new and used trailer in'

##### 2.2 Natural Language Processing

### 2.2.1 Train-Test Split

In [13]:
# Import train-test split
from sklearn.model_selection import train_test_split

In [14]:
# Set X and y; perform train-test split
X = df['lems']
y = df['disaster_happened']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

### 2.2.2 Vectorizers

In [15]:
# Imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### 2.2.2.1 CountVectorizer

#### 2.2.2.2 TF-IDF

# 3. Modeling

In [16]:
# Baseline
df['disaster_happened'].value_counts(normalize=True)

0    0.530266
1    0.469734
Name: disaster_happened, dtype: float64

## 3.1 Searching for Best Model

In [17]:
# Import Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

In [23]:
# Store the types of models
classifiers = [#LogisticRegression(), 
               #KNeighborsClassifier(),
               RandomForestClassifier(), 
               #AdaBoostClassifier(), 
               #SVC()
]

names = [#'Logistic Reg',
         #'kNN', 
         'Random Forest',
         #'AdaBoost',
         #'SVC'
]

In [24]:
parameters = [ 
              { # Logistic Regression (Total: 40)
#                  'vect__ngram_range': [(1, 1), (1, 2)], # 2
#                  'clf__C': np.logspace(-5, 0, 10), # 10
#                  'clf__penalty': ['l1', 'l2'], # 2
              },
                
              { # kNN (Total: 20)
#                  'vect__ngram_range': [(1, 1), (1, 2)], # 2
#                  'clf__n_neighbors': range(5, 10), # 5
#                  'clf__weights': ['uniform', 'distance'], # 2
#                  'clf__n_jobs': [-2]
              },
                
              { # Random Forest (Total: 20)
                  #'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], # 2
                  'clf__n_estimators': range(5, 30), # 5
                  'clf__criterion': ['gini', 'entropy'], # 2
                  'clf__n_jobs': [-2]
              },
                
              { # AdaBoost (Total: 6)
#                  'vect__ngram_range': [(1, 1), (1, 2)], # 2
#                  'clf__n_estimators': [10, 30, 50] # 3
              },
                
              { # SVC (Total: 20)
#                  'vect__ngram_range': [(1, 1), (1, 2)], # 2
#                  'clf__C': range(1, 10), # 10
#                  'clf__kernel': ['rbf'], # 1
                  
              }
             ]

In [29]:
score = 0

for name, classifier, params in zip(names, classifiers, parameters):
    clf_pipe = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english', max_features=30_000, ngram_range=(1, 3))),
        ('clf', classifier),
    ])
    gs_clf = GridSearchCV(clf_pipe, param_grid=params, n_jobs=-1)
    clf = gs_clf.fit(X_train, y_train)
    
    test_score = clf.score(X_test, y_test)
    if test_score > score:
        score = test_score
        best_mod = clf
        
    print("For {}:".format(name))
    print('Train score: {}'.format(clf.score(X_train, y_train)))
    print('Test score:  {}'.format(test_score))
    print('\n --- \n')



For Random Forest:
Train score: 0.9537744940571796
Test score:  0.5887423174936292

 --- 



## 3.2 Optimizing Best Model

In [18]:
from sklearn.feature_extraction import text

# Build stop words list
stop_words_1 = text.ENGLISH_STOP_WORDS.union(set(['ago',
 'america',
 'ball',
 'birthday',
 'cat',
 'excited',
 'family',
 'glad',
 'health',
 'home',
 'key',
 'kid',
 'labor',
 'little',
 'morning',
 'return',
 'rt',
 'straight',
 'walk',
 'water',
 'wednesday',
 'weekend',
 'wow',
 'aint',
 'better',
 'brother',
 'couple',
 'drive',
 'ga',
 'georgia',
 'great',
 'hfd',
 'home',
 'im',
 'leave',
 'line',
 'lo',
 'mean',
 'pick',
 'sc',
 'school',
 'september',
 'sign',
 'sky',
 'sound',
 'south',
 'starting',
 'stop',
 'support',
 'true',
 'tweet',
 'wedding',
 'west',
 'win',
 'wish',
 'word',
 'ya',
 'york'
]))


# Instantiate TFIDF Vectorizer
tfidf = TfidfVectorizer(stop_words=stop_words_1, max_features = 30_000, ngram_range=(1, 4))

# Fit and Transform train, Transform test
train_features = tfidf.fit_transform(X_train)
test_features = tfidf.transform(X_test)


# Instantiate Random Forest
random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=42, )

# Fit Random Forest to training set
random_forest.fit(train_features, y_train)

# Score train, test sets
random_forest.score(train_features, y_train), random_forest.score(test_features, y_test)

(0.9699967876646322, 0.5921900764503073)

In [19]:
print(random_forest.get_params)

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)>


In [20]:
# Looking at Confusion Matrix
from sklearn.metrics import confusion_matrix


# Get predictions
y_preds = random_forest.predict(test_features)


# Generate confusion matrix
confusion_matrix(y_test, y_preds)

array([[5175, 1926],
       [3515, 2726]])