<a href="https://colab.research.google.com/github/brighamfrandsen/econ484/blob/master/examples/tweet%20demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

First some preliminaries. Import some useful packages:

In [None]:
# Fetch the course repository; the data files read later live under econ484/data/.
!git clone https://github.com/brighamfrandsen/econ484.git

import warnings
warnings.filterwarnings('ignore')
import json
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import tree
from matplotlib import pyplot as plt


    more prelims

Read in the raw data: Obama's tweets from a CSV file and Trump's tweets from a JSON file.

In [None]:
# Load the two raw tweet datasets from the cloned course repository.
obama = pd.read_csv('econ484/data/obama_tweets.csv')

# Read the JSON inside a context manager so the file handle is closed as soon
# as parsing finishes instead of being left open until garbage collection.
with open('econ484/data/trump_tweets.json', encoding='utf8') as trump_file:
    trump = pd.DataFrame(json.load(trump_file))

# Preview the first rows of each dataset
print(obama.head())
print(trump.head())

In [None]:
# Label every row with its author: 1 marks Obama's tweets, 0 marks Trump's.
for frame, is_obama in ((obama, 1), (trump, 0)):
    frame['obama_indicator'] = is_obama

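Some data cleaning: combine the two datasets, drop retweets, strip URLs, digits, and punctuation, then tokenize the text and compute a few simple text features.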

In [None]:
# Clean the raw tweets and derive simple text features.
#
# Reads:  `obama` and `trump` (DataFrames that already carry `obama_indicator`).
# Writes: `both`  - one row per non-retweet tweet with the cleaned `_text`,
#                   its `tokens`, and the `n_cap_let`, `total_words` and
#                   `avg_word_len` features;
#         `tknzr` - the TweetTokenizer instance, reused by the vectorizer cell.

# Give the tweet-text column the same name in both frames so they can be stacked
obama = obama.rename(columns={'Text': '_text'})
trump = trump.rename(columns={'text': '_text'})

# Concatenating the two datasets; ignore_index avoids duplicate row labels
keep_cols = ['_text', 'obama_indicator']
both = pd.concat([obama[keep_cols], trump[keep_cols]], ignore_index=True)

# Rows with no text at all cannot be cleaned or tokenized
both = both.dropna(subset=['_text'])

# Dropping retweets. The explicit copy makes `both` an independent frame, so
# the column assignments below never write into a slice of another frame.
both = both.loc[~both['_text'].str.contains('^RT'), :].copy()
print('Shape of raw data:', both.shape)

# Cleaning the text
both['_text'] = both['_text'].str.strip()

# Ordered (pattern, replacement) pairs. They are applied one after another, so
# every pattern operates on the output of the step before it.
substitutions = [
    (r'\s+', ' '),                       # Removing extra spaces
    (r'(?:: )?https?://\S+', ''),        # Removing URLs
    (r'\.?pic\.twitter\.com/\S+', ''),   # Removing Twitter image links
    # Removing the HTML-escaped ampersand as a whole entity. Doing it here,
    # rather than deleting the letters "amp" after punctuation is stripped,
    # keeps words such as "campaign" or "example" intact.
    (r'(?i)&(?:amp;?)+', ''),
    (r'\d+', ''),                        # Removing digits
    (r'[…"#$%&\'\(\)*+,-./:;<=>?@\[\\\]^_`{|}~’“”—]', ''),  # Removing punctuation
    (r'–|––|\s+', ' '),                  # Normalizing dashes and extra spaces
]
for pattern, replacement in substitutions:
    both['_text'] = both['_text'].str.replace(pattern, replacement, regex=True)

# Count the number of capital letters before converting to lowercase
both['n_cap_let'] = both['_text'].str.count('[A-Z]')

# Converting the text to lowercase
both['_text'] = both['_text'].str.lower()

# Removing specific words related to the context. 'president obama' is listed
# before 'obama' so the two-word phrase is removed as a unit.
# NOTE(review): removal is by substring, so names embedded in handles and
# hashtags are stripped as well, but 'ofa' is also cut out of unrelated words
# that happen to contain it - confirm that trade-off is intended.
remove_words = ['trump', 'president obama', 'obama', 'barack', 'michelle', 'ofa']
for word in remove_words:
    both['_text'] = both['_text'].str.replace(word, '', regex=False)

# Tokenizing the text (any underscore left inside a token is dropped)
tknzr = TweetTokenizer()
both['tokens'] = both['_text'].apply(
    lambda text: [token.replace('_', '') for token in tknzr.tokenize(text)]
)

# Dropping rows with empty tokens after cleaning
both = both.loc[both['tokens'].map(len) > 0, :].copy()

# Generating feature variables. Every remaining row has at least one token,
# so the average below never divides by zero.
both['total_words'] = both['tokens'].map(len)
both['avg_word_len'] = both['tokens'].map(
    lambda tokens: sum(len(token) for token in tokens) / len(tokens)
)

print("Data cleaned and features generated...")


Generate features: a bag-of-words count matrix built from one-, two-, and three-word phrases.

In [None]:
# Bag-of-words settings: unigrams through trigrams, English stop words removed,
# tokenized with the same TweetTokenizer used during cleaning. Terms found in
# more than half of the tweets (max_df) or in fewer than 0.01% of them (min_df)
# are left out of the vocabulary.
vectorizer = CountVectorizer(
    tokenizer=tknzr.tokenize,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=.0001,
    max_df=.5,
)
bow_mat = vectorizer.fit_transform(both['_text'])
print('Bag of words feature set:', bow_mat.shape)

Fit the model: a shallow decision tree (depth 2) that predicts whether a tweet was written by Obama.

In [None]:
# Fit a depth-2 decision tree on the bag-of-words counts, with the Obama
# indicator as the target. The fixed random_state keeps the fit reproducible.
dtree = tree.DecisionTreeClassifier(max_depth=2, random_state=123)
dtree.fit(X=bow_mat, y=both['obama_indicator'])

In [None]:
# Visualize the fitted decision tree.
# An explicit, wider figure keeps the node text legible, and class_names puts
# the predicted author on each node. class_names must follow the class labels
# in ascending order: 0 -> Trump, 1 -> Obama (see the obama_indicator column).
# Feature names are passed as a plain list of strings.
fig, ax = plt.subplots(figsize=(12, 6))
tree.plot_tree(
    dtree,
    feature_names=list(vectorizer.get_feature_names_out()),
    class_names=['Trump', 'Obama'],
    filled=True,
    ax=ax,
)
ax.set_title('Decision tree: Obama vs. Trump tweets')
plt.show()

Try a new tweet:

In [None]:
# Score a tweet the model has not seen: turn it into counts over the fitted
# vocabulary, then ask the tree for class probabilities (the output columns
# follow dtree.classes_).
# NOTE(review): the tweet is vectorized as typed and does not pass through the
# cleaning steps applied to the training text - confirm that is acceptable.
new_tweet = 'Congrats to our newest class of foundation scholars. These leaders are working to change their communities for the better'
new_feats = vectorizer.transform([new_tweet])
dtree.predict_proba(new_feats)

In [None]:
# Show the feature vector built for the new tweet (the output of vectorizer.transform).
print(new_feats)