In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [97]:
from matplotlib import rc
rc('font', family='NanumGothicOTF')
plt.rcParams['axes.unicode_minus'] = False

In [98]:
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
import nltk

nltk.data.path.append('C:/임시/nltk_data/')

In [99]:
from sentiment_lexicon import Lexicon
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

__데이터 로드__

In [100]:
# Read the news articles from the local data directory.
path='C:/임시/text_mining_for_US_election/formats/'
df = pd.read_csv(path+"news.csv")

# Drop the articles dated 2020-11-01 and 2020-11-02. The comparison is done on
# the raw date strings, i.e. before the column is parsed to datetimes below.
df = df.drop(df[df["Date"].isin(["2020-11-01", "2020-11-02"])].index)

# Renumber the remaining rows 0..n-1 without keeping the old index as a column.
df = df.reset_index(drop=True)
df["Date"] = pd.to_datetime(df["Date"])

__각 기사들을 문장토큰화__

In [101]:
# Split every article into a list of sentences.
df["Text"] = df["Text"].apply(sent_tokenize)

__모든 문장들 다 쓰지 말고 트럼프나 바이든 언급 문장들만 쓰자__

In [102]:
def only_Trump_or_Biden_sentence(element,search):
    """Keep only the sentences of one article that mention a candidate.

    Parameters
    ----------
    element : iterable of str
        Sentences of a single article.
    search : str
        Name to look for; must be exactly 'Trump' or 'Biden'.

    Returns
    -------
    list of str or None
        The sentences that contain `search` as a case-sensitive substring, in
        their original order. For any other `search` value the function prints
        'wrong key' and returns None.
    """
    if search not in ('Trump', 'Biden'):
        print('wrong key')
        return None

    return [sentence for sentence in element if search in sentence]

In [103]:
# For every article, keep the sentences that mention each candidate by name.
df['Trump_sentences'] = df['Text'].apply(only_Trump_or_Biden_sentence, search='Trump')
df['Biden_sentences'] = df['Text'].apply(only_Trump_or_Biden_sentence, search='Biden')

__불용어 제거, 불필요한 글자 제거, 소문자 통일 등__

In [104]:
def proper_tokenize(element):
    """Clean and word-tokenize every sentence of one article.

    For each sentence, every run of characters other than ASCII letters and
    '-' is replaced by a space and the result is split with `word_tokenize`.
    A token is kept when it is longer than two characters and its lower-case
    form is not in the module-level `stopWords` set (looked up at call time).
    Kept tokens are lower-cased, except the exact tokens 'Trump' and 'Biden'.

    Parameters
    ----------
    element : iterable of str, or float
        Sentences of one article. A float (a missing value such as NaN) is
        treated as an article without sentences.

    Returns
    -------
    list of list of str
        One token list per input sentence; an empty list for a float input.
    """
    # A missing text arrives as a float NaN. Returning an empty list (instead
    # of printing it and returning None) keeps the later steps, which call
    # len() on and iterate over this value, working. isinstance() also covers
    # numpy float scalars, which subclass float.
    if isinstance(element, float):
        return []

    ret_list = []
    for sentence in element:
        # Everything except ASCII letters and hyphens becomes whitespace.
        sentence = re.sub('[^a-zA-Z-]+',' ',sentence)

        clean_words = []
        for word in word_tokenize(sentence):
            if len(word)>2 and word.lower() not in stopWords:
                # Preserve the capitalised candidate names; lower-case the rest.
                if word not in ['Trump','Biden']:
                    word = word.lower()
                clean_words.append(word)

        ret_list.append(clean_words)
    return ret_list

In [105]:
# NLTK's English stop-word list as a set. proper_tokenize and proper_lemmatize
# read this global each time they are called.
stopWords = set(stopwords.words('english'))

In [106]:
# Clean and word-tokenize each candidate's sentences; every cell becomes a
# list of token lists (one token list per sentence).
df['Trump_sentences'] = df['Trump_sentences'].apply(proper_tokenize)
df['Biden_sentences'] = df['Biden_sentences'].apply(proper_tokenize)

__표제어 추출__

In [107]:
# Shared WordNet lemmatizer instance, used by proper_lemmatize.
WL = WordNetLemmatizer()

In [108]:
def proper_lemmatize(word,tag):
    """Lemmatize one token according to its part-of-speech tag.

    Parameters
    ----------
    word : str
        The token to lemmatize.
    tag : str
        POS tag of the token as produced by `pos_tag`.

    Returns
    -------
    str
        The lemma, or an empty string when the lemma is in the module-level
        `stopWords` set.
    """
    # startswith() is used instead of tag[0] so an empty tag falls through to
    # the default branch rather than raising IndexError.
    if tag.startswith('V'):
        ret = WL.lemmatize(word,'v')
    elif tag.startswith('J'):
        ret = WL.lemmatize(word,'a')
    elif tag.startswith('NNP'):
        # Proper nouns are kept verbatim. The previous check compared the
        # single character tag[0] with 'NNP', which could never be true, so
        # this branch was unreachable.
        ret = word
    else:
        ret = WL.lemmatize(word)

    # Lemmas that turn out to be stop words are blanked, not removed, so the
    # output stays aligned one-to-one with the input tokens.
    return ret if ret not in stopWords else ''


def apply_lemmetize(element):
    """POS-tag and lemmatize every tokenized sentence of one article.

    Each token list in `element` is tagged with `pos_tag`, and every
    (token, tag) pair is passed through `proper_lemmatize`.

    Parameters
    ----------
    element : list of list of str
        Token lists, one per sentence. The list is modified in place.

    Returns
    -------
    list of list of str
        The same list object, with each entry replaced by its lemmatized
        token list.
    """
    for position, tokens in enumerate(element):
        element[position] = [
            proper_lemmatize(token, pos)
            for token, pos in pos_tag(tokens)
        ]

    return element

In [109]:
# Lemmatize the tokenized sentences of each candidate. apply_lemmetize
# rewrites every token list in place and returns the same list.
df['Trump_sentences'] = df['Trump_sentences'].apply(apply_lemmetize)
df['Biden_sentences'] = df['Biden_sentences'].apply(apply_lemmetize)

__역토큰화__

In [110]:
# Join each token list back into one space-separated string per sentence.
df['Trump_sentences'] = df['Trump_sentences'].apply(lambda sentences: [' '.join(tokens) for tokens in sentences])
df['Biden_sentences'] = df['Biden_sentences'].apply(lambda sentences: [' '.join(tokens) for tokens in sentences])

__감성 분석__

In [111]:
def get_sentiment_mean(element):
    """Average the polarity scores of all sentences of one article.

    Every sentence is scored with the module-level `analyser`
    (`analyser.polarity_scores`), and the 'neg', 'neu', 'pos' and 'compound'
    values are averaged over the sentences.

    Parameters
    ----------
    element : list of str
        Sentences to score.

    Returns
    -------
    dict
        Keys 'neg', 'neu', 'pos', 'compound' mapped to the mean score. When
        `element` is empty every value is 0.
    """
    totals = dict.fromkeys(('neg', 'neu', 'pos', 'compound'), 0)
    count = len(element)

    # Nothing to score: report all zeros and avoid dividing by zero.
    if not count:
        return totals

    for sentence in element:
        for name, value in analyser.polarity_scores(sentence).items():
            totals[name] += value

    return {name: total / count for name, total in totals.items()}

In [112]:
# Shared VADER analyzer instance; get_sentiment_mean reads it through the
# global name `analyser`.
analyser = SentimentIntensityAnalyzer()

In [113]:
# Score every article per candidate and expand the resulting dicts into
# columns named Trump_* / Biden_* (neg, neu, pos, compound).
Trump_score_df = pd.DataFrame(df['Trump_sentences'].apply(get_sentiment_mean).tolist()).add_prefix('Trump_')
Biden_score_df = pd.DataFrame(df['Biden_sentences'].apply(get_sentiment_mean).tolist()).add_prefix('Biden_')

In [114]:
# Append the per-article score columns to the articles. The score frames were
# built from plain lists and df's index was reset right after loading, so the
# column-wise concat is expected to line rows up one-to-one.
# NOTE(review): this cell is not idempotent - running it again appends the
# same score columns a second time.
df = pd.concat([df,Trump_score_df,Biden_score_df],axis=1)
# Display the combined frame.
df

Unnamed: 0,Title,Date,Press,Topic,Text,Trump_sentences,Biden_sentences,Trump_neg,Trump_neu,Trump_pos,Trump_compound,Biden_neg,Biden_neu,Biden_pos,Biden_compound
0,Newt Gingrich: Democrats prove they are the 'L...,2020-09-01,Fox,2020 Presidential Election,[Fox News Flash top entertainment and celebrit...,[election year story scandals outrageous momen...,[election year story scandals outrageous momen...,0.089643,0.741071,0.169143,0.072971,0.054308,0.774538,0.171154,0.221808
1,Ed Rollins says election could come down to Wi...,2020-09-01,Fox,2020 Presidential Election,[White House Strategic Communications Director...,[white house strategic communication director ...,[white house strategic communication director ...,0.088500,0.744500,0.167250,0.382975,0.074667,0.702667,0.223000,0.644600
2,Anti-Trump Lincoln Project launches coalition ...,2020-09-01,Fox,ELECTIONS,"[House Minority Leader Kevin McCarthy, R-Calif...",[house minority leader kevin mccarthy r-calif ...,[go city last month president different part c...,0.026000,0.793000,0.181000,0.371200,0.066000,0.811000,0.123000,0.318200
3,Biden’s tough tone on riots comes amid attacks...,2020-09-01,Fox,ELECTIONS,[Steve Hilton highlights President Trump's ful...,[steve hilton highlight president Trump fulfil...,[crunch time president Trump opponent former v...,0.070545,0.812909,0.116545,0.104764,0.028889,0.821889,0.149222,0.242411
4,NY attorney general files injunction to stop T...,2020-09-02,Fox,ELECTIONS,[President Trump holds a ‘Making America Great...,[president Trump hold make america great rally...,[],0.111750,0.658250,0.230000,0.405200,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2309,Markey overcomes Kennedy challenge in Massachu...,2020-09-01,Politico,2020 elections,[Jill Biden is tired of hearing about her husb...,[donald Trump even say word gaffe Biden say la...,"[jill Biden tire hear husband gaffe, donald Tr...",0.122125,0.717000,0.160750,-0.061888,0.099000,0.808375,0.092625,-0.095700
2310,Trump blows past the intelligence to accuse Ch...,2020-09-01,Politico,Congress,[Allies of President Donald Trump expressed su...,[ally president donald Trump express support s...,[],0.097875,0.755250,0.147000,0.042300,0.000000,0.000000,0.000000,0.000000
2311,Twitter forces Democratic candidate to delete ...,2020-09-01,Politico,2020 elections,[Former Vice President Joe Biden has opened up...,[former vice president joe Biden open -point l...,[former vice president joe Biden open -point l...,0.018417,0.784083,0.197500,0.403150,0.031571,0.793429,0.175000,0.364757
2312,DOJ announces new oversight for surveillance a...,2020-09-01,Politico,Campaigns,[TV advertising is easily the most expensive t...,[],[],0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [115]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator

In [116]:
# Label every article per candidate from its mean compound score:
# 1 when the score is >= 0.05, -1 when it is <= -0.05, and 0 otherwise.
df['Trump_sentiment'] = np.select(
    [df['Trump_compound'] >= 0.05, df['Trump_compound'] <= -0.05],
    [1, -1],
    default=0,
)
df['Biden_sentiment'] = np.select(
    [df['Biden_compound'] >= 0.05, df['Biden_compound'] <= -0.05],
    [1, -1],
    default=0,
)

In [117]:
# Score columns averaged per publication date for each candidate.
trump_score_cols = ['Trump_neg','Trump_neu','Trump_pos','Trump_compound','Trump_sentiment']
biden_score_cols = ['Biden_neg','Biden_neu','Biden_pos','Biden_compound','Biden_sentiment']

# get_sentiment_mean reports all-zero scores for an article that has no
# sentence mentioning the candidate. Averaging those placeholder rows in would
# pull every daily mean towards 0, so only articles that actually contain
# sentences about the candidate are aggregated.
mentions_trump = df['Trump_sentences'].map(len) > 0
mentions_biden = df['Biden_sentences'].map(len) > 0

Trump_senti_groupby_date = df[mentions_trump].groupby('Date')[trump_score_cols].mean().astype('float32')
Biden_senti_groupby_date = df[mentions_biden].groupby('Date')[biden_score_cols].mean().astype('float32')

In [118]:
# One bar per date showing the mean sentiment label (-1 / 0 / 1 averaged over
# that day's articles) for each candidate.
Trump_fig = px.bar(Trump_senti_groupby_date, x=Trump_senti_groupby_date.index, y="Trump_sentiment")
Biden_fig = px.bar(Biden_senti_groupby_date, x=Biden_senti_groupby_date.index, y="Biden_sentiment")

# Title each figure with the candidate it covers.
Trump_fig.update_layout(title_text='News about Trump')
Biden_fig.update_layout(title_text='News about Biden')

# Render both figures.
Trump_fig.show()
Biden_fig.show()

In [144]:
# Name the column axis 'emotion' so the area charts can refer to it through
# facet_col="emotion".
Trump_senti_groupby_date.columns.name='emotion'
Biden_senti_groupby_date.columns.name='emotion'

In [160]:
# Area charts of the daily means, faceted by the 'emotion' column axis and
# wrapped at two facets per row. Note that Trump_fig / Biden_fig are rebound
# here, replacing the bar charts created earlier.
Trump_fig = px.area(Trump_senti_groupby_date, facet_col="emotion", facet_col_wrap=2 , title = 'Senti analysis of News text about Trump')
Biden_fig = px.area(Biden_senti_groupby_date, facet_col="emotion", facet_col_wrap=2 , title = 'Senti analysis of News text about Biden')
Trump_fig.show()
Biden_fig.show()

### News LDA

In [146]:
# Add extra words (including the lower-cased candidate names) to the shared
# stop-word set before preprocessing the full texts for topic modelling.
# NOTE(review): this mutates the global set that proper_tokenize and
# proper_lemmatize read at call time, so re-running the earlier sentiment
# preprocessing cells after this one gives different results than a fresh
# top-to-bottom run.
stopWords.update(['say','get','know','joe','would','think','go','much','that','trump','biden','donald'])

In [147]:
# Sanity check: show the rows whose Text is null.
df[df['Text'].isnull()]

Unnamed: 0,Title,Date,Press,Topic,Text,Trump_sentences,Biden_sentences,Trump_neg,Trump_neu,Trump_pos,Trump_compound,Biden_neg,Biden_neu,Biden_pos,Biden_compound,Trump_sentiment,Biden_sentiment


In [151]:
# Clean and word-tokenize every sentence of every article. Text already holds
# one list of sentences per article at this point.
# NOTE(review): Text is overwritten in place, so this cell should run exactly
# once per kernel session.
df['Text'] = df['Text'].apply(proper_tokenize)

In [152]:
# Lemmatize the tokenized sentences of every article. apply_lemmetize rewrites
# each token list in place and returns the same list.
df['Text'] = df['Text'].apply(apply_lemmetize)

In [153]:
# Join each token list back into one space-separated string per sentence.
df['Text'] = df['Text'].apply(lambda sentences: [' '.join(tokens) for tokens in sentences])

__Press 별 코퍼스 생성__  
- F_corpus : Fox  
- NR_corpus : NPR , Reuters  
- CP_corpus : CBS , Politico

In [135]:
def join_small_corpus(chunk):
    """Flatten a sequence of lists into a single list.

    Parameters
    ----------
    chunk : iterable of iterables
        For example, one list of sentence strings per article.

    Returns
    -------
    list
        All items of all inner iterables, in their original order.
    """
    return [item for small in chunk for item in small]

In [154]:
# Build one flat corpus of sentence strings per outlet group.
F_corpus = join_small_corpus(df.loc[df['Press'] == 'Fox', 'Text'].tolist())
NR_corpus = join_small_corpus(df.loc[df['Press'].isin(['NPR', 'Reuters']), 'Text'].tolist())
CP_corpus = join_small_corpus(df.loc[df['Press'].isin(['CBS', 'Politico']), 'Text'].tolist())

__sklearn 이용 버전__

In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit one TF-IDF vectorizer per corpus and keep both the fitted vectorizer and
# the dense document-term matrix.

# Fox
F_tfidfv = TfidfVectorizer(min_df=1)
F_tfidf_matrix = F_tfidfv.fit_transform(F_corpus).toarray()

# NPR + Reuters
NR_tfidfv = TfidfVectorizer(min_df=1)
NR_tfidf_matrix = NR_tfidfv.fit_transform(NR_corpus).toarray()

# CBS + Politico
CP_tfidfv = TfidfVectorizer(min_df=1)
CP_tfidf_matrix = CP_tfidfv.fit_transform(CP_corpus).toarray()

In [140]:
from sklearn.decomposition import LatentDirichletAllocation

In [162]:
def get_topics(components, feature_names, n=5):
    """Print the `n` highest-weighted features of every topic.

    Parameters
    ----------
    components : iterable of 1-D numpy arrays
        One weight vector per topic (called here with an LDA model's
        `components_`).
    feature_names : sequence
        Feature name for each position of a weight vector.
    n : int, default 5
        Number of top features to show per topic.

    Returns
    -------
    None
        One line per topic is printed: "Topic k:" followed by a list of
        (feature name, weight rounded to 2 decimals) pairs, heaviest first.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending, so reverse it and keep the first n positions.
        top_positions = weights.argsort()[::-1][:n]
        top_terms = [(feature_names[pos], weights[pos].round(2)) for pos in top_positions]
        print("Topic %d:" % topic_no, top_terms)

In [163]:
# Fit a 2-topic online LDA on the Fox TF-IDF matrix. Only the fitted model is
# needed here, so fit() is used and no document-topic matrix is computed.
F_LDA=LatentDirichletAllocation(n_components=2,learning_method='online',random_state=1120,max_iter=2)
F_LDA.fit(F_tfidf_matrix)

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
F_feature_getter = getattr(F_tfidfv, 'get_feature_names_out', None) or F_tfidfv.get_feature_names
F_word_set = list(F_feature_getter())
get_topics(F_LDA.components_,F_word_set)

Topic 1: [('court', 159.28), ('people', 125.09), ('want', 110.37), ('supreme', 109.58), ('fox', 106.13)]
Topic 2: [('president', 335.96), ('election', 236.24), ('debate', 201.57), ('state', 198.03), ('presidential', 181.21)]


In [164]:
# Fit a 2-topic online LDA on the NPR + Reuters TF-IDF matrix. Only the fitted
# model is needed here, so fit() is used and no document-topic matrix is
# computed.
NR_LDA=LatentDirichletAllocation(n_components=2,learning_method='online',random_state=1120,max_iter=2)
NR_LDA.fit(NR_tfidf_matrix)

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
NR_feature_getter = getattr(NR_tfidfv, 'get_feature_names_out', None) or NR_tfidfv.get_feature_names
NR_word_set = list(NR_feature_getter())
get_topics(NR_LDA.components_,NR_word_set)

Topic 1: [('reuters', 136.58), ('trust', 73.89), ('thomson', 69.87), ('standard', 68.97), ('principle', 68.24)]
Topic 2: [('better', 298.21), ('vote', 234.38), ('election', 193.07), ('state', 175.08), ('voter', 164.56)]


In [165]:
# Fit a 2-topic online LDA on the CBS + Politico TF-IDF matrix. Only the
# fitted model is needed here, so fit() is used and no document-topic matrix
# is computed.
CP_LDA=LatentDirichletAllocation(n_components=2,learning_method='online',random_state=1120,max_iter=2)
CP_LDA.fit(CP_tfidf_matrix)

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
CP_feature_getter = getattr(CP_tfidfv, 'get_feature_names_out', None) or CP_tfidfv.get_feature_names
CP_word_set = list(CP_feature_getter())
get_topics(CP_LDA.components_,CP_word_set)

Topic 1: [('tax', 302.67), ('campaign', 276.2), ('like', 259.81), ('make', 256.97), ('zone', 256.96)]
Topic 2: [('percent', 769.91), ('voter', 745.86), ('state', 572.84), ('vote', 558.76), ('win', 471.08)]
