# Personal Attacks and Political Subgroup Analysis on Twitter 

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from collections import Counter
from textblob import TextBlob
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)

import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
warnings.filterwarnings('ignore')

NameError: name 'warnings' is not defined

In [2]:
data = pd.read_csv('tweet.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)
data.head()

Unnamed: 0,Tweet,Type,Number of Votes
0,RT @Papapishu: Man it would fucking rule if we...,abusive,4
1,It is time to draw close to Him &#128591;&#127...,normal,4
2,if you notice me start to act different or dis...,normal,5
3,"Forget unfollowers, I believe in growing. 7 ne...",normal,3
4,RT @Vitiligoprince: Hate Being sexually Frustr...,abusive,4


In [3]:
url_re = r'http\S+'
at_re = r'@[\w]*'
rt_re = r'^[rt]{2}'
punct_re = r'[^\w\s]'

data['Tweet'] = data['Tweet'].str.lower() # Lower Case
data['Tweet'] = data['Tweet'].str.replace(url_re, '') # Remove Links/URL
data['Tweet'] = data['Tweet'].str.replace(at_re, '') # Remove @
data['Tweet'] = data['Tweet'].str.replace(rt_re, '') # Remove rt
data['Tweet'] = data['Tweet'].str.replace(punct_re, '') # Remove Punctation

data.head()

Unnamed: 0,Tweet,Type,Number of Votes
0,man it would fucking rule if we had a party ...,abusive,4
1,it is time to draw close to him 128591127995 f...,normal,4
2,if you notice me start to act different or dis...,normal,5
3,forget unfollowers i believe in growing 7 new ...,normal,3
4,hate being sexually frustrated like i wanna ...,abusive,4


## Section 1: Personal Attacks

Adapted from https://github.com/ewulczyn/wiki-detox/blob/master/src/figshare/Wikipedia%20Talk%20Data%20-%20Getting%20Started.ipynb

In [4]:
# download annotated comments and annotations

ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' 


def download_file(url, fname):
    urllib.request.urlretrieve(url, fname)

                
download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

In [6]:
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

In [7]:
len(annotations['rev_id'].unique())

115864

In [8]:
# labels a comment as an atack if the majority of annotators did so
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5
comments['attack'] = labels

In [9]:
# fit a simple text classifier

train_comments = comments.query("split=='train'")
test_comments = comments.query("split=='test'")

clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', LogisticRegression()),
])
clf = clf.fit(train_comments['comment'], train_comments['attack'])
auc = roc_auc_score(test_comments['attack'], clf.predict_proba(test_comments['comment'])[:, 1])
print('Test ROC AUC: %.3f' %auc)





Test ROC AUC: 0.954


In [10]:
data['Personal Attack?'] = [clf.predict([tweet + ""])[0] for tweet in data['Tweet']]
data['Polarity'] = data['Tweet'].map(lambda text: TextBlob(text).sentiment.polarity)
data

Unnamed: 0,Tweet,Type,Number of Votes,Personal Attack?,Polarity
0,man it would fucking rule if we had a party ...,abusive,4,False,-0.600000
1,it is time to draw close to him 128591127995 f...,normal,4,False,0.100000
2,if you notice me start to act different or dis...,normal,5,False,-0.050000
3,forget unfollowers i believe in growing 7 new ...,normal,3,False,0.068182
4,hate being sexually frustrated like i wanna ...,abusive,4,True,-0.575000
...,...,...,...,...,...
134451,my fucking queen,abusive,5,True,-0.600000
134452,osteporosis treated with pemf rebuild bone ma...,normal,3,False,0.000000
134453,why does my phone screen keeps flickring im s...,normal,3,False,-0.562500
134454,bigdata vs reality but equally applies to any...,normal,5,False,0.000000


## Section 2: Political Subgroups

Adapted from https://github.com/chouhbik/Sentiment-Analysis-of-Tweets/blob/master/Tweets%20Analysis%20DemvsRep.ipynb

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import train_test_split
from plotly.offline import init_notebook_mode,iplot
from sklearn.linear_model import LogisticRegression
from nltk.stem.lancaster import LancasterStemmer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from nltk.probability import FreqDist
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from textblob import TextBlob
from nltk.text import Text  
import nltk as nlp
import re


In [12]:
model_data = pd.read_csv("ExtractedTweets.csv")
model_data.dropna(axis = 0, inplace = True)
model_data["Party_log"] = [1 if each == "Democrat" else 0 for each in model_data.Party]

In [13]:
#add some unnecessary words to STOPWORDS list
STOPWORDS.add("rt")
STOPWORDS.add("s")
STOPWORDS.add("u")
STOPWORDS.add("amp")
STOPWORDS.add("th")
STOPWORDS.add("will")
STOPWORDS.add("t")
STOPWORDS.add("m")

In [15]:
democrat=model_data[model_data.Party=="Democrat"]
republican=model_data[model_data.Party=="Republican"]

In [24]:
democrat_list=[]
for d in democrat.Tweet:
    d=re.sub(r'http\S+', '', d) #remove links
    d=re.sub("[^a-zA-Z]", " ", d) #remove all characters except letters
    d=d.lower() #convert all words to lowercase
    d=nltk.word_tokenize(d) #split sentences into word
    d=[word for word in d if not word in STOPWORDS] #remove the stopwords
    lemma=nlp.WordNetLemmatizer() 
    d=[lemma.lemmatize(word) for word in d] #identify the correct form of the word in the dictionary
    d=" ".join(d)
    democrat_list.append(d) #append words to list

republican_list=[]
for r in republican.Tweet:
    r=re.sub(r'http\S+', '', r)
    r=re.sub("[^a-zA-Z]", " ", r)
    r=r.lower()
    r=nltk.word_tokenize(r)
    r=[word for word in r if not word in STOPWORDS]
    lemma=nlp.WordNetLemmatizer()
    r=[lemma.lemmatize(word) for word in r]
    r=" ".join(r)
    republican_list.append(r)

In [25]:
train, test = train_test_split(model_data, test_size=0.3, train_size=0.7, random_state=14)
train.shape, test.shape

((60521, 4), (25938, 4))

In [26]:
train.head()

Unnamed: 0,Party,Handle,Tweet,Party_log
53928,Republican,HouseJudiciary,Safeguarding Trade Secrets in the United State...,0
50957,Republican,RepCurbelo,@DefendOurFuture Finding new ways to de-politi...,0
39092,Democrat,repjimcooper,"The final results are in, and 3,007 new high s...",1
60545,Republican,RepHalRogers,"RT @WhiteHouse: ""Last year, I also pledged tha...",0
75135,Republican,repdonyoung,"Happy birthday to my good friend, @SpeakerRyan...",0


In [27]:
st = LancasterStemmer()

def token(text):
    txt = nltk.word_tokenize(text.lower())
    return [st.stem(word) for word in txt]


cv = CountVectorizer(lowercase=True, 
                     tokenizer=token, stop_words=STOPWORDS,
                     analyzer=u'word', min_df=4)


In [30]:
vec_train = cv.fit_transform(train['Tweet'].tolist())
vec_test = cv.transform(test['Tweet'].tolist())
prediction_input_data = cv.transform(data['Tweet'].tolist())

In [35]:
lr = LogisticRegression()
lr.fit(X = vec_train, y = train['Party_log'])

y_pred_lr=lr.predict(vec_test)
accuracy_score(y_pred_lr, test['Party_log'])

0.7869535045107564

In [36]:
y_pred_data = lr.predict(prediction_input_data)

In [37]:
data['Political Leaning'] = ["Democrat" if val == 0 else "Republican" for val in y_pred_data]
data

Unnamed: 0,Tweet,Type,Number of Votes,Personal Attack?,Polarity,Political Leaning
0,man it would fucking rule if we had a party ...,abusive,4,False,-0.600000,Democrat
1,it is time to draw close to him 128591127995 f...,normal,4,False,0.100000,Democrat
2,if you notice me start to act different or dis...,normal,5,False,-0.050000,Democrat
3,forget unfollowers i believe in growing 7 new ...,normal,3,False,0.068182,Democrat
4,hate being sexually frustrated like i wanna ...,abusive,4,True,-0.575000,Democrat
...,...,...,...,...,...,...
134451,my fucking queen,abusive,5,True,-0.600000,Republican
134452,osteporosis treated with pemf rebuild bone ma...,normal,3,False,0.000000,Democrat
134453,why does my phone screen keeps flickring im s...,normal,3,False,-0.562500,Democrat
134454,bigdata vs reality but equally applies to any...,normal,5,False,0.000000,Republican
