In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
import pickle
import os



# 1) Experiment Overview

### 1.1) Overview: This project will read in and examine the sentiment of tweets related to global warming with the intent of understanding the tweets author's opinions towards the existence of climate change. With the growing threat of climate change it can be suprising that there is still a significant amount portion of the population who doubt or diminish the significance of the global trend and this data set offers insite into identifying the two groups with tweets classified into yes (climate change exists), no (climate change does not exist), and NaN (the tweet simply gives information about climate change (no opinion or sentiment). For the purposes of this sentiment analysis project the subjective tweets will be the main focus of the project as we attempt to train a model to classify whether or not a person believes in climate change based on a tweet.  

### 1.2) The Data:  The data was downloaded from a public repository on data.world and consists of the tweet itself, the determination of belief column titles 'existence' (climate change exists, does not exist, or tweet does not contain an opinion (conveying information)), and a confidence evaluation of the existence column that gives the authors relative confidence on the assesment of sentiment made in the existence column. For the puposes of this project only the tweets that were determined as conveying sentiment will be used and the confidence of that prediction will be ignored (the existence column will be taken as base truth). It is important to note that this data is relatively old having been published in 2013 however, while the amount of climate change disbelievers has decreased since then, it is still relevant in todays society. Thus, creating a model based off older tweets can still be applicable to data collected today and future data collected about coimate change beliefs.

### 1.3) Ethical Considerations: This data is public in both its collection from twitter and from the source on data.world which simply serves the pupose of aggregating otherwise widespread information that would be difficult to collect for public use from Twitter. The only potential source of ethical consideration in regards to this data set is that Twitters API for data collection maintains rules and regulations as well as an approval process to help prevent the inappropriate mining or use of mined data. Since, this data has already been collected however, we know that it was collected from twitter under those guidleines by the initial authors and since the scope of this project is not controversial in nature nor outside the intended use of the original authors it again maintains the standards of ethical data use.

# 2) Data Collection

### 2.1) Load the data from the downloaded csv (link in works cited)

In [2]:
global_warming = pd.read_csv('global_warming_sentiment.csv',encoding= 'unicode_escape')


In [3]:
global_warming.head()

Unnamed: 0,tweet,existence,existence.confidence
0,Global warming report urges governments to act...,Yes,1.0
1,Fighting poverty and global warming in Africa ...,Yes,1.0
2,Carbon offsets: How a Vatican forest failed to...,Yes,0.8786
3,Carbon offsets: How a Vatican forest failed to...,Yes,1.0
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,Yes,0.8087


# 3) Data Preprocessing

### 3.1) Clean the data

In [4]:
# Drop Existence.Confidence column (not needed)
global_warming.drop(['existence.confidence'],axis =1, inplace =True)
global_warming.head()

Unnamed: 0,tweet,existence
0,Global warming report urges governments to act...,Yes
1,Fighting poverty and global warming in Africa ...,Yes
2,Carbon offsets: How a Vatican forest failed to...,Yes
3,Carbon offsets: How a Vatican forest failed to...,Yes
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,Yes


In [5]:
# clean the existence column such that only binary yes / no is left represented as 1 / 0 respectively
print(global_warming['existence'].unique())
global_warming = global_warming.replace(['Y','N'],['Yes','No'])
print(global_warming['existence'].unique())
global_warming = global_warming.dropna()
print(global_warming['existence'].unique())
global_warming = global_warming.replace(['Yes','No'],[1,0]) #convert yes to 1 and no to 0
print(global_warming['existence'].unique())
global_warming.head()

['Yes' nan 'No' 'Y' 'N']
['Yes' nan 'No']
['Yes' 'No']
[1 0]


Unnamed: 0,tweet,existence
0,Global warming report urges governments to act...,1
1,Fighting poverty and global warming in Africa ...,1
2,Carbon offsets: How a Vatican forest failed to...,1
3,Carbon offsets: How a Vatican forest failed to...,1
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,1


In [6]:
# Check types to make sure everything is set as needed for model creation
global_warming.dtypes

tweet        object
existence     int64
dtype: object

### 3.2) Define the functions to clean the text data, tokenize the document, and construct a TfidfVectorizer (these will be used in optimizing the SGD function in the next section as part of the grid search optimization)

In [7]:
# Optional functions for data cleaning and tokenizing
from nltk.tokenize import word_tokenize


def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    porter = PorterStemmer()
    line = [porter.stem(word) for word in text.split()]
    return line




In [8]:
#Potential functions for tfidf / hashing vector creation
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)


vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

In [9]:
# Split data into training and test data

X = global_warming['tweet']
y = global_warming['existence']


X_train, X_test, y_train, y_test =\
    train_test_split(X, y, 
                     test_size=0.2, 
                     random_state=0, 
                     stratify=y)

# 4) Model Optimization and Serialization

### 4.1) Define the model and optimize it using a grid search

In [10]:
# Define Initial Model
model = SGDClassifier(loss='log', random_state=1)
SGD_vect = Pipeline([('vect', vect),
                     ('clf', model)])



In [11]:
SGD_vect.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'clf', 'vect__alternate_sign', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__n_features', 'vect__ngram_range', 'vect__norm', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'clf__alpha', 'clf__average', 'clf__class_weight', 'clf__early_stopping', 'clf__epsilon', 'clf__eta0', 'clf__fit_intercept', 'clf__l1_ratio', 'clf__learning_rate', 'clf__loss', 'clf__max_iter', 'clf__n_iter_no_change', 'clf__n_jobs', 'clf__penalty', 'clf__power_t', 'clf__random_state', 'clf__shuffle', 'clf__tol', 'clf__validation_fraction', 'clf__verbose', 'clf__warm_start'])

In [15]:
#Define Parameters for tunig the initial model
stop = stopwords.words('english')
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__preprocessor':[None,preprocessor],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__stop_words':[stop,None],
               'clf__loss' : ['log', 'modified_huber'], #only trying loss functions that support probability estimators
               'clf__penalty': ['l1', 'l2'],
               'clf__alpha': [.00001,.0001,.01,.1,1.0]}]





In [16]:
#use a grid search to optimize the model
gs_SGD = GridSearchCV(SGD_vect, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

gs = gs_SGD.fit(X_train, y_train)
print(gs_SGD.best_score_)
print(gs_SGD.best_params_)


Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  4.5min finished


0.8289940828402367
{'clf__alpha': 1e-05, 'clf__loss': 'log', 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__preprocessor': <function preprocessor at 0x1253df430>, 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then'

In [17]:
predicted = gs.predict(X_test)
acc = accuracy_score(y_test, predicted)
print(acc)

0.834319526627219


### 4.2) Outcomes of initial fit of the model

### The above model was fitted using cross validation on grid search optimization of the model. The model itself represents a hashing vector applied to the data which is then processed via an SGD classifier. In doing so the grid search selected a hasing vector that was optimized to tokenize the data via the tokenizer_porter which both tokenizes each line and stems the words extracted as tokens to their root (via the porter method). Prior to tokenizing the hashing vector preprocessed the data to remove punctuation with the exception of emojis as defined in the function 'preprocessor' above and chose to remove stop words via the nltk.corpus stop_words package. With the data vectorized using that optimized hashing function the SGD classifier selected an l2 penalty, an alpha value of .00001 and a log loss function as the optimized parameters. With this optimization the model produces an accuracy of 82.899% on the training data which only slightly improved to an accuracy of 83.432% when applied to the test data. These selected parameters will be used in the following code to create a model that can be used in an online web application designed to continue training the data in the hopes of imporving its accuracy.

### As a note on the project requirments the SGD classifier differes from the LR classifier used in Ch08 and similarly the HashingVectorized optimized here is different than the TFIDF used in the book such that this model can be applied to an online format with updates via partial_fit as new information comes in via the web application. In addition to these obvious differences between the book and this project the hashing vector includes the preprocessor parameter to be optimized and the SGD is being optimized by analyzing the loss function while the book did not include either of these as options for the grid search. By doing so both have been updated from their defaults in order to improve the model.



### 4.3) FIt the model against the entire data set and pickle the model for use in web application

In [20]:
# redifine functions based on the optimized LR above converting the LR to a SGD classifier to accomodate online learning and the TFIDF vectorizor to a hashing vecotrizer to accomodate online learning

vect_optimized = HashingVectorizer(decode_error='ignore',
                         norm = None,
                         n_features=2**21,
                         preprocessor=preprocessor,
                         stop_words = stop,
                         tokenizer=tokenizer_porter)
                           
new_model = SGDClassifier(loss='log', random_state=1,alpha = .00001, penalty = 'l2') 
                        

In [21]:
# # test the new models created to accomodate online learning for comparison to optimized LR model
X_train_vect = vect_optimized.transform(X_train)

new_model.partial_fit(X_train_vect,y_train,classes=(0,1))

X_test_vect = vect_optimized.transform(X_test)
p = new_model.predict(X_test_vect)
acc = accuracy_score(p,y_test)

print(acc)

0.7692307692307693


#### While the accuracy has decreased slightly in changing the model to an online capable format it is still accurate enough to continue with accuracy expected to approve as more information is passed to the model via the web application.

In [22]:
# Fit the model on the entire data set to be used in the web application
X_total = vect_optimized.transform(X)
full_SGD = new_model.partial_fit(X_total,y,classes=(0,1))

# 5) Model Serialization and function exportation for Website Creation

### 5.1) Pickle the model created and optimized above

In [23]:
#Pickle the model to be used in web application (along with stop words - hashing vector does not need to be pickled)

dest = os.path.join('GlobalWarmingClassifier','website','pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(full_SGD, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4) 


### 5.2) Define the vectorizing function and test the functunality of the pickle objects created above

In [33]:
os.chdir('/Users/zaneheald/Desktop/MachineLearning/zane_heald_machine_learning/machine-learning/projects/project02/')

In [28]:
os.getcwd()

'/Users/zaneheald/Desktop/MachineLearning/zane_heald_machine_learning/machine-learning/projects/project02'

In [29]:
%%writefile GlobalWarmingClassifier/website/vectorizer.py
# Stand Alone Hashing Vector File
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import HashingVectorizer
import pickle
import os

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(
                os.path.join(cur_dir, 
                'pkl_objects', 
                'stopwords.pkl'), 'rb'))

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

def tokenizer_porter(text):
    porter = PorterStemmer()
    line = [porter.stem(word) for word in text.split()]
    return line

vect_optimized = HashingVectorizer(decode_error='ignore',
                         norm = None,
                         n_features=2**21,
                         preprocessor=preprocessor,
                         stop_words = stop,
                         tokenizer=tokenizer_porter)




Overwriting GlobalWarmingClassifier/website/vectorizer.py


### 5.3) Test the functionality of the above exported functions and pickled models

In [34]:
os.chdir('GlobalWarmingClassifier/website')

In [35]:
#check to ensure pickling worked properly

from vectorizer import vect_optimized

clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

label = {0:'negative', 1:'positive'}
example = ["Global Warming is Fake News!"]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%' %\
      (label[clf.predict(X)[0]], 
       np.max(clf.predict_proba(X))*100))



Prediction: negative
Probability: 99.97%


#### The above shows that the pickling has succesfully worked and predictions can be made regarding new 'tweets' passed to it. While the example above predicts incorrectly its purpose is to demonstrate that all the components for the web application are in place to handle future predictions.

# 6) Website Creation

### This is the link to the website created for the final portion of this project which allows user creation of tweets that are predicted as either believing in climate change (posiitive) or disbelieving (negative). All data is recorded in an SQL lite data base for futher updates to the model created above:

http://zaneheald.pythonanywhere.com/

### Please see files included under the website folder for insite into the creation of the SQLite database, the updating of the pickle file, and HTML files used to create the website.


# 7) Works Cited

Link to data world
https://data.world/crowdflower/sentiment-of-climate-change/workspace/file?filename=1377884570_tweet_global_warming.csv

