In [1]:
#import packages
import pandas as pd
from nltk.tokenize import TweetTokenizer

## Data Cleaning (preliminary)

In [2]:
# Load the combined training tweets; the path is relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
tweets = pd.read_pickle("../data/combined_train.pkl")

In [3]:
# Preview the first rows of the raw data (columns: tweet_text, informative).
tweets.head()

Unnamed: 0,tweet_text,informative
0,"CONGRATS ON HITTING YOIR GOAL GUYS, I'm sure t...",0
1,RT @ajwamood: #ajwamood : Harvey the first maj...,1
2,RT @ajwamood: #ajwamood : Harvey the first maj...,1
3,RT @ajwamood: #ajwamood : Harvey the first maj...,1
4,RT @ajwamood: #ajwamood : Harvey the first maj...,1


In [4]:
def _clean_tweet_text(text: pd.Series) -> pd.Series:
    """Normalize raw tweet text for bag-of-words modelling.

    Steps, applied in order:
      1. lowercase everything;
      2. drop @mentions (``@`` followed by any run of non-whitespace);
      3. drop URLs (tokens starting with ``http`` or ``www``);
      4. drop a leading "rt" retweet marker;
      5. replace every character that is not an ASCII letter with a space
         (this also removes "#", digits and all punctuation).

    Parameters
    ----------
    text : pd.Series
        Series of tweet strings.

    Returns
    -------
    pd.Series
        Cleaned strings carrying the same index as ``text``, so the result can
        be assigned straight back to the source DataFrame without any index
        misalignment.
    """
    cleaned = text.str.lower()

    # TODO: truncate elongations, e.g. re.sub(r'(.)\1+', r'\1\1', "haaaaapppppyyy")
    #       https://stackoverflow.com/questions/10072744/remove-repeating-characters-from-words
    # TODO: spell out numbers

    # Mentions and URLs are removed outright; no placeholder token is kept.
    cleaned = cleaned.str.replace(r'@\S+', '', regex=True)
    cleaned = cleaned.str.replace(r'(?:http|www)\S+', '', regex=True)

    # Remove only a leading "rt" *token*. str.lstrip('rt') would strip any
    # leading run of the characters 'r' and 't', mangling words such as
    # "trump" -> "ump" or "the" -> "he".
    cleaned = cleaned.str.replace(r'^\s*rt\b', '', regex=True)

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave all punctuation in place.
    cleaned = cleaned.str.replace(r'[^a-zA-Z]', ' ', regex=True)
    return cleaned


tweets["tweet_text"] = _clean_tweet_text(tweets["tweet_text"])

In [5]:
# Preview the cleaned text to confirm mentions, URLs and punctuation are gone.
tweets.head()

Unnamed: 0,tweet_text,informative
0,congrats on hitting yoir goal guys i m sure t...,0
1,ajwamood harvey the first major hurricane...,1
2,ajwamood harvey the first major hurricane...,1
3,ajwamood harvey the first major hurricane...,1
4,ajwamood harvey the first major hurricane...,1


In [6]:
# Keep only the first occurrence of each cleaned tweet text; later repeats are dropped.
tweets = tweets.drop_duplicates(subset="tweet_text", keep="first")

In [7]:
# (rows, columns) remaining after de-duplication.
tweets.shape

(15026, 2)

In [8]:
# Preview after de-duplication; the original index labels are kept (the index is not reset).
tweets.head()

Unnamed: 0,tweet_text,informative
0,congrats on hitting yoir goal guys i m sure t...,0
1,ajwamood harvey the first major hurricane...,1
5,when we get back to schs after harvey hits,0
9,not always good when your city shows up on a s...,1
10,side by side satellite images compare the si...,1


In [9]:
# Persist the cleaned, de-duplicated tweets so they can be reloaded without re-running the cleaning.
tweets.to_pickle("../data/tweets_cleaned2.pkl")

In [10]:
# Class balance: relative frequency of each `informative` label.
tweets["informative"].value_counts(normalize=True)

1    0.698523
0    0.301477
Name: informative, dtype: float64

## Preprocessing (preliminary)

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
# Feature frame (single text column, kept two-dimensional) and target labels.
X = tweets.loc[:, ["tweet_text"]]
y = tweets.loc[:, "informative"]

# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

## Preliminary Model: CountVectorizer and Logistic Regression 

In [71]:
# Bag-of-words features (tweet-aware tokenizer, English stop words removed)
# followed by a logistic-regression baseline.
tknzr = TweetTokenizer()
cvec = CountVectorizer(stop_words="english", tokenizer=tknzr.tokenize)

# The vocabulary is learned from the training split only; the test split is
# transformed with that same vocabulary.
train_counts = cvec.fit_transform(X_train["tweet_text"])
test_counts = cvec.transform(X_test["tweet_text"])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(cvec, "get_feature_names_out"):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# .toarray() yields a plain ndarray (.todense() returns the discouraged np.matrix).
# NOTE(review): densifying a vocabulary-sized matrix is memory hungry; the
# estimator also accepts the sparse matrices directly if the labelled
# DataFrame view is not needed.
X_train_cvec = pd.DataFrame(train_counts.toarray(), columns=feature_names)
X_test_cvec = pd.DataFrame(test_counts.toarray(), columns=feature_names)

lr = LogisticRegression()
lr_model = lr.fit(X_train_cvec, y_train)
predictions = lr.predict(X_test_cvec)



In [72]:
# Mean score over 5 cross-validation folds of the logistic regression on the training features.
# NOTE(review): the CountVectorizer vocabulary was fitted on all of X_train before these folds
# are formed, so every validation fold has already influenced the feature set; wrapping the
# vectorizer and classifier in a Pipeline would avoid that.
cross_val_score(lr, X_train_cvec, y_train, cv=5).mean()



0.7899762613670223

In [73]:
# Score of the fitted model on the training split; compare with the test score to gauge overfitting.
lr.score(X_train_cvec, y_train)

0.9393078970718722

In [74]:
# Score of the fitted model on the held-out test split.
lr.score(X_test_cvec, y_test)

0.8080915624168219