In [216]:
# Import relevant packages 
import pandas as pd
import re
import string
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datetime import datetime

import statsmodels.formula.api as smf

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kajsarosenblad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Advanced Social Data Science 2 (ASDS2) Exercises


## April 21: Preprocessing

### 1: Importing data without preprocessing

1. Download the data set available here, which contains the nearly 6,000 times Donald Trump insulted someone on Twitter: https://www.kaggle.com/ayushggarg/all-trumps-twitter-insults-20152021 
2. Load the csv as a data frame using pandas.
3. The variable ‘target’ has an indicator for the target of the insult. The data reveals that Trump’s most frequent insult target is ‘the media’ (‘the-media’ in the data). Create a binary indicator for whether Trump targets the media. Fit a linear regression with this binary indicator as the dependent variable and the date of the tweet as the independent variable. Does Trump become more or less likely to insult the media over time? Why might this be? 
4. Using the CountVectorizer from sklearn, convert the tweets to a document-feature matrix. What are the dimensions of the matrix?


In [217]:
#create df
df = pd.read_csv('trump_insult_tweets_2014_to_2021.csv')

In [218]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,target,insult,tweet
0,1,2014-10-09,thomas-frieden,fool,"Can you believe this fool, Dr. Thomas Frieden ..."
1,2,2014-10-09,thomas-frieden,DOPE,"Can you believe this fool, Dr. Thomas Frieden ..."
2,3,2015-06-16,politicians,all talk and no action,Big time in U.S. today - MAKE AMERICA GREAT AG...
3,4,2015-06-24,ben-cardin,It's politicians like Cardin that have destroy...,Politician @SenatorCardin didn't like that I s...
4,5,2015-06-24,neil-young,total hypocrite,"For the nonbeliever, here is a photo of @Neily..."


In [219]:
#datetime conversion
df.date = [datetime.strptime(item, '%Y-%m-%d') for item in df.date]


In [220]:
#datetime conversion
df.date = df.date.map(datetime.toordinal)

In [221]:
#binary variable media targeting 
df['media'] = np.where(df['target'] == 'the-media', 1, 0)


In [222]:
#linear regression
model = smf.ols(formula = 'media ~ date', data = df)
results = model.fit(cov_type = 'HC0')
results.summary()

0,1,2,3
Dep. Variable:,media,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,93.48
Date:,"Wed, 21 Apr 2021",Prob (F-statistic):,5.08e-22
Time:,12:20:38,Log-Likelihood:,-3178.1
No. Observations:,10360,AIC:,6360.0
Df Residuals:,10358,BIC:,6375.0
Df Model:,1,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-31.6371,3.284,-9.633,0.000,-38.074,-25.200
date,4.31e-05,4.46e-06,9.669,0.000,3.44e-05,5.18e-05

0,1,2,3
Omnibus:,4262.419,Durbin-Watson:,0.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13040.939
Skew:,2.255,Prob(JB):,0.0
Kurtosis:,6.141,Cond. No.,913000000.0


In [223]:
#countvectoriser
vectorizer = CountVectorizer()
vectorizer.fit_transform(df.tweet)

# encode document
vector = vectorizer.transform(df.tweet)
# summarize encoded vector
print(vector.shape)

(10360, 10057)


### 2: Preprocessing steps

1. Remove all tagged users, i.e. words starting with the ‘@’ character.
2. Lowercase all tweet text.
3. Remove numbers.
4. Remove punctuation. 
5. Remove extra whitespaces.
6. Remove default stopwords.
7. Stem words.
8. Lemmatize words.


In [224]:
#1.
df.tweet = df.tweet.str.replace(r'(\s)\@\w+','')

In [225]:
#2. 
df.tweet = [item.lower() for item in df.tweet]

In [226]:
#3.
df.tweet = df.tweet.str.replace('[0-9]', '')

In [227]:
#4.

df.tweet = [item.translate(str.maketrans('', '', string.punctuation)) for item in df.tweet]

In [228]:
#5. 
df.tweet = df.tweet.str.replace(' +', ' ')

In [229]:
#6.

df.tweet = df.tweet.str.split()
stop = stopwords.words('english')


df.tweet = df.tweet.apply(lambda x: [item for item in x if item not in stop])


In [234]:
#7.
ps = PorterStemmer()
df['stem'] = df.tweet.apply(lambda x: ' '.join([ps.stem(item) for item in x]))

In [236]:
#8. 
lemmatizer = WordNetLemmatizer()
df['lemm'] = df.tweet.apply(lambda x: ' '.join([lemmatizer.lemmatize(item) for item in x]))

### 3: Consequences of preprocessing

Create a new document-feature matrix with the preprocessed tweets. How do the dimensions of this matrix compare with those of the matrix you created in 1.3?


In [240]:
vect_stem = vectorizer.fit_transform(df.stem)

# encode document

# summarize encoded vector
print(vect_stem.shape)

vect_lemm = vectorizer.fit_transform(df.lemm)

print (vect_lemm.shape)

(10360, 7789)
(10360, 9289)
