In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
from nltk.stem import PorterStemmer
%matplotlib inline

df = pd.read_csv('./TweetLabel.csv')
df.head()

Unnamed: 0,USER_ID,COR_LAN,COR_LON,DATE,TIME,TWEET,Label
0,383358119,38.731,-77.43,8/11/2014,16:05:21,its like im nothing now forgotten ever sense o...,0
1,607543503,38.473,-77.46,10/28/2014,22:10:02,brainnevasleeps mikebrowncover a were u theree...,0
2,391594988,38.942,-77.45,5/18/2014,20:09:14,i always think that there will never be an inc...,0
3,414498694,39.048,-77.391,4/27/2014,16:37:51,haiderbhatti4 coincidentally the acronym of th...,0
4,1286090814,38.845,-77.444,12/18/2014,21:10:22,i am a witness and part of the first response ...,0


In [3]:
porter = PorterStemmer() ###Stemmer

"""get the list of the stopwords"""
stopwords = pd.read_csv('./Stopword.csv', header = None)
stopwords = list(stopwords.iloc[:, 0])

"""set the list of news channel"""
Newswords = ["nbcnews","onlyindc","fox5newsdc","dctraffic","wtoptraffic","washingtonpost",
   "04francene","wtop","nbcwashington","vatraffic","wtoptraffic","wtoptraff","mdtraffic","vatraff", "wtop"]
stopwords.extend(Newswords)
"""word stemming"""
stopwords = [porter.stem(x) for x in stopwords]


"""conver to lower case"""
stopwords = [x.lower() for x in stopwords]

In [4]:
#### Process the tweet
colnames = df.columns


new_tweet = []
for i in range(len(df['USER_ID'])):
    words = df['TWEET'].iloc[i].split()
    """word stemming"""
    words = [porter.stem(x) for x in words]
    """stop word filtering"""
    words = [w for w in words if not (w in stopwords or w in df.columns)]
    """get unique words"""
    words = np.unique(words)
    """put it back together"""
    
    new_tweet.append(' '.join(words) )

df['New_TWEET'] = new_tweet

In [5]:
df_new = df.copy()

### build test data and train data
coor_table = []
for i in range(len(df_new['USER_ID'])):
    words = df_new['New_TWEET'].iloc[i].split()
    for w in words:
        if w in df_new.columns:
            continue
        new_list = [1 if w in df['New_TWEET'].iloc[j] else 0 for j in range(len(df['USER_ID']))]
        coor = df_new['Label'].corr(pd.DataFrame(new_list, columns = ['this']).iloc[:, 0])
        if abs(coor) > 0.15:
            df_new[w] = new_list
            coor_table.append({'word': w, 'coorrelation': coor, 'index_in_data': i})
coor_dataframe = pd.DataFrame(coor_table)

In [30]:
###Make the dataframe with label 0 and 1 even
df_new1 = df_new[df_new['Label']==1]
df_new0 = df_new[df_new['Label']==0]
df_new_sample = df_new0.sample(df_new1.shape[0]*2)

df_sample = pd.concat([df_new1, df_new_sample])



X = df_sample[list(coor_dataframe['word'])]
y = df_sample['Label']

#Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
# Predict the response for the training data and the test data
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

#Predict and score the model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

print(test_score, train_score)


0.24624073720479212 0.36860027421257313


In [31]:
###Stemmed words which may have correlation with the accident-label
coor_dataframe

Unnamed: 0,word,coorrelation,index_in_data
0,car,0.172741,10
1,5,0.172857,11
2,lane,0.178378,29
3,traffic,0.240229,68
4,66,0.23257,75
5,block,0.200272,186
6,af,0.197105,246
7,vehicl,0.163846,505
8,accid,0.219246,618
9,9,0.174404,728
