## Import Libraries

In [222]:
import numpy as np
import pandas as pd

In [223]:
# Set display settings for dataframes
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_colwidth", None)

## Read csv Files

In [224]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [225]:
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221
1,1,It's not like we lack evidence of anthropogenic global warming,126103
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954


In [226]:
df_test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make sure that it is not alone in fighting climate change… https://t.co/O7T8rCgwDq,169760
1,Combine this with the polling of staffers re climate change and womens' rights and you have a fascist state. https://t.co/ifrm7eexpj,35326
2,"The scary, unimpeachable evidence that climate change is already here: https://t.co/yAedqcV9Ki #itstimetochange #climatechange @ZEROCO2_;..",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPutin got to you too Jill ! \nTrump doesn't believe in climate change at all \nThinks it's s hoax,476263
4,RT @FakeWillMoore: 'Female orgasms cause global warming!'\n-Sarcastic Republican,872928


In [227]:
df_train["sentiment"].nunique()

4

In [228]:
df_train["sentiment"].unique()

array([ 1,  2,  0, -1], dtype=int64)

In [229]:
df_train["sentiment"].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

Replace the 4 classes into 2x classes: Class 1 = 1, believe in climate change 1 and 2. Class 2 = 0, not believe in climate change -1 and 0.

In [230]:
df_train["sentiment"].replace([1, 2], 1, inplace=True)
df_train["sentiment"].replace([0, -1], -1, inplace=True)

In [231]:
df_train["sentiment"].value_counts()

 1    12170
-1     3649
Name: sentiment, dtype: int64

Remove some punctuation and web-urls

In [232]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df_train["message"] = df_train["message"].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [233]:
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? url-web via @mashable",625221
1,1,It's not like we lack evidence of anthropogenic global warming,126103
2,1,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late url-web url-web…,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change url-web,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954


In [234]:
df_train["message"] = df_train["message"].str.lower()

In [235]:
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,"polyscimajor epa chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? url-web via @mashable",625221
1,1,it's not like we lack evidence of anthropogenic global warming,126103
2,1,rt @rawstory: researchers say we have three years to act on climate change before it’s too late url-web url-web…,698562
3,1,#todayinmaker# wired : 2016 was a pivotal year in the war on climate change url-web,573736
4,1,"rt @soynoviodetodas: it's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #electionnight",466954


In [236]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [237]:
def remove_punctuation(message):
    return ''.join([l for l in message if l not in string.punctuation])

In [238]:
df_train["message"] = df_train["message"].apply(remove_punctuation)

In [239]:
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dioxide is main cause of global warming and wait what urlweb via mashable,625221
1,1,its not like we lack evidence of anthropogenic global warming,126103
2,1,rt rawstory researchers say we have three years to act on climate change before it’s too late urlweb urlweb…,698562
3,1,todayinmaker wired 2016 was a pivotal year in the war on climate change urlweb,573736
4,1,rt soynoviodetodas its 2016 and a racist sexist climate change denying bigot is leading in the polls electionnight,466954


Use feature extraction module CountVectorizer

In [240]:
from sklearn.feature_extraction.text import CountVectorizer

In [241]:
vect = CountVectorizer()
vect.fit(df_train["message"])

CountVectorizer()

In [242]:
print(vect.vocabulary_)



In [243]:
vector = vect.transform(df_train["message"])

In [244]:
print(vector.shape)

(15819, 24819)


In [245]:
print(type(vector))

<class 'scipy.sparse.csr.csr_matrix'>


In [246]:
print(vector.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Train test model:

In [247]:
from sklearn.model_selection import train_test_split

In [248]:
X = vector
y = df_train["sentiment"]

In [249]:
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [250]:
from sklearn.linear_model import LogisticRegression

In [251]:
lr = LogisticRegression(max_iter=200)

In [252]:
lr.fit(X_train,y_train)

LogisticRegression(max_iter=200)

In [253]:
pred_lr = lr.predict(X_test)

Checkl metrics

In [254]:
from sklearn.metrics import classification_report

In [255]:
print('Classification Report')
print(classification_report(y_test, pred_lr, target_names=['0: Nope - climate change', '1: Yeah - climate change']))

Classification Report
                          precision    recall  f1-score   support

0: Nope - climate change       0.76      0.60      0.67       743
1: Yeah - climate change       0.88      0.94      0.91      2421

                accuracy                           0.86      3164
               macro avg       0.82      0.77      0.79      3164
            weighted avg       0.86      0.86      0.86      3164



In [256]:
vector = vect.transform(df_test["message"])

In [257]:
model_submit_results = lr.predict(vector)

In [258]:
pd.Series(model_submit_results).value_counts()

 1    8385
-1    2161
dtype: int64

In [259]:
df_submit = df_test[['tweetid']]
df_submit["sentiment"] = model_submit_results

In [260]:
from datetime import datetime
now = datetime.now().strftime("%Y%m%d%H%M")
df_submit.to_csv(f"submissions/ae2_leon_{now}.csv", index=False)