In [106]:
# imports
import pandas as pd
import numpy as np
import string
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.tree import DecisionTreeClassifier
import nltk
nltk.download("stopwords")
nltk.download("punkt")
#--------#
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iforrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iforrest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [107]:
# define stopwords
stopwords = stopwords.words('english')

In [108]:
# load dataframes
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [109]:
# check train_df
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [110]:
# check test_df
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [111]:
# create text preprocessing function
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stopwords\
              and token != " " \
              and token.strip() not in string.punctuation]
    text = " ".join(tokens)
    return text

In [112]:
# create vectorizer
count_vectorizer = feature_extraction.text.CountVectorizer()
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(min_df=0.01, max_df=.95, ngram_range=(1,3))

In [113]:
# create list of train_text and test_text
train_text = list(train_df['text'])
test_text = list(test_df['text'])

In [114]:
# create list of texts with punctuation removed (replaced with single space)
new_train_text = []
new_test_text = []

# create translator to replace punctuation with spaces
# better score generated with punctuation removed
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space

# apply to training text
for text in train_text:
    # new_text = text.translate(translator)
    new_text = preprocess_text(text)
    new_train_text.append(new_text)
    
# apply to test text
for text in test_text:
    # new_text = text.translate(translator)
    new_text = preprocess_text(text)
    new_test_text.append(new_text)

In [115]:
# vectorize text in train data
train_vectors = count_vectorizer.fit_transform(train_df["text"])
# vectorize text in test data
test_vectors = count_vectorizer.transform(test_df["text"])

In [116]:
# alternate train vectors - tfidf, punctuation removed
train_vectors_tfidf = tfidf_vectorizer.fit_transform(new_train_text)

In [117]:
# create basic model assuming tweet text is proper indicator of disaster
clf = linear_model.RidgeClassifier()
# create second logistic regression model
lr = linear_model.LogisticRegression()
# create decisiontreeregressor model
dtc = DecisionTreeClassifier()

In [118]:
# check initial cross-validation score for RidgeClassifier
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
print(scores)

[0.59421842 0.56455572 0.64082434]


In [119]:
# check cross validation scores on tfidf vectorizer
# tokenizing text, removing stop words, and removing punctuation increases scores
scores = model_selection.cross_val_score(clf, train_vectors_tfidf, train_df["target"], cv=3, scoring="f1")
print(scores)

[0.61826698 0.61220743 0.65178571]


In [120]:
# check cross validation score on tfidf vectorizer and decision tree regressor
scores = model_selection.cross_val_score(dtc, train_vectors_tfidf, train_df["target"], cv=3, scoring="f1")
print(scores)

[0.5663632  0.56279509 0.62979094]


In [33]:
# fit ridge classifier to train vectors and train target variable
clf.fit(train_vectors, train_df["target"])

RidgeClassifier()

In [37]:
# create sample submission dataframe
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission["target"] = clf.predict(test_vectors)
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [38]:
# export submission
sample_submission.to_csv("./data/submission.csv", index=False)