In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [8]:
from matplotlib import rc
# Render figure text with the NanumGothicOTF font family
# (presumably so Korean labels display correctly - confirm the font is installed).
rc('font', family='NanumGothicOTF')
# Disable matplotlib's Unicode minus sign for negative tick labels.
plt.rcParams['axes.unicode_minus'] = False

In [9]:
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
import nltk

#nltk.data.path.append('C:/임시/nltk_data/')

In [10]:
from sentiment_lexicon import Lexicon
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

__데이터 로드__

In [11]:
# Read the news articles from the CSV directory below.
path = 'D:/세미 1차 텍스트 마이닝/formats/csvs/'
df = pd.read_csv(path + "news.csv")

# Remove every article dated 2020-11-01 or 2020-11-02. The comparison is made
# on the raw "Date" strings, i.e. before the column is parsed to datetimes.
is_excluded_date = df["Date"].isin(["2020-11-01", "2020-11-02"])
df = df.drop(index=df.index[is_excluded_date])

# Parse the remaining dates so later cells can group by calendar day.
df["Date"] = pd.to_datetime(df["Date"])

__각 기사들을 문장토큰화__

In [12]:
# Replace each article body with its list of sentences.
# NOTE: running this cell twice fails, because "Text" then already holds lists.
df["Text"] = df["Text"].apply(sent_tokenize)

__모든 문장들 다 쓰지 말고 트럼프나 바이든 언급 문장들만 쓰자__

In [13]:
def only_Trump_or_Biden_sentence(element,search):
    """Return the sentences of one article that mention the given candidate.

    Parameters
    ----------
    element : iterable of str
        Sentences of a single article.
    search : str
        Candidate name to look for; must be 'Trump' or 'Biden'.

    Returns
    -------
    list of str or None
        The sentences that contain ``search`` as a case-sensitive substring,
        in their original order. For any other ``search`` value the function
        prints 'wrong key' and returns None.
    """
    allowed_names = ('Trump', 'Biden')

    if search in allowed_names:
        return [text for text in element if search in text]

    print('wrong key')
    return None

In [14]:
# For every article keep only the sentences mentioning each candidate;
# the candidate name is forwarded to the helper as its `search` argument.
df['Trump_sentences'] = df['Text'].apply(only_Trump_or_Biden_sentence, args=('Trump',))
df['Biden_sentences'] = df['Text'].apply(only_Trump_or_Biden_sentence, args=('Biden',))

__불용어 제거, 불필요한 문자 제거, 소문자 통일 등__

In [15]:
def proper_tokenize(element):
    """Clean and word-tokenize every sentence of one article.

    For each sentence, every run of characters other than ASCII letters and
    hyphens is replaced by a single space, then the sentence is split with
    ``word_tokenize``. A token is kept only if it is longer than two
    characters and its lower-cased form is not in the module-level
    ``stopWords`` set. Kept tokens are lower-cased, except the exact tokens
    'Trump' and 'Biden', which keep their capitalisation.

    Parameters
    ----------
    element : iterable of str
        Sentences of a single article.

    Returns
    -------
    list of list of str
        One token list per input sentence, in the same order.
    """
    protected_names = ('Trump', 'Biden')
    tokenized_sentences = []

    for raw_sentence in element:
        letters_only = re.sub('[^a-zA-Z-]+', ' ', raw_sentence)

        kept_tokens = []
        for token in word_tokenize(letters_only):
            # Skip very short tokens and stop words.
            if len(token) <= 2 or token.lower() in stopWords:
                continue
            kept_tokens.append(token if token in protected_names else token.lower())

        tokenized_sentences.append(kept_tokens)

    return tokenized_sentences

In [16]:
# English stop words from NLTK as a set; read as a global by
# proper_tokenize and proper_lemmatize.
stopWords = set(stopwords.words('english'))

In [17]:
# Turn each candidate's sentence strings into cleaned token lists.
df = df.assign(
    Trump_sentences=lambda frame: frame['Trump_sentences'].apply(proper_tokenize),
    Biden_sentences=lambda frame: frame['Biden_sentences'].apply(proper_tokenize),
)

__표제어 추출__

In [18]:
# Shared WordNet lemmatizer instance; read as a global by proper_lemmatize.
WL = WordNetLemmatizer()

In [19]:
def proper_lemmatize(word,tag):
    """Lemmatize one token according to its part-of-speech tag.

    Parameters
    ----------
    word : str
        Token to lemmatize.
    tag : str
        Part-of-speech tag of ``word`` (apply_lemmetize passes the tags
        returned by ``pos_tag``).

    Returns
    -------
    str
        The lemma, or '' when the lemma is in the module-level ``stopWords``
        set. Tags starting with 'V' are lemmatized as verbs, tags starting
        with 'J' as adjectives, tags starting with 'NNP' (proper nouns) are
        returned unchanged, and everything else goes through the lemmatizer's
        default part of speech.
    """
    if tag.startswith('V'):
        ret = WL.lemmatize(word, 'v')
    elif tag.startswith('J'):
        ret = WL.lemmatize(word, 'a')
    elif tag.startswith('NNP'):
        # Fix: the previous test `tag[0] == 'NNP'` compared a single character
        # with a three-character string and could never be true, so proper
        # nouns were never left untouched as this branch intends.
        ret = word
    else:
        ret = WL.lemmatize(word)

    # Lemmatization can turn a token into a stop word; blank those out.
    return ret if ret not in stopWords else ''


def apply_lemmetize(element):
    """Lemmatize every token list of one article, in place.

    Each token list is POS-tagged with ``pos_tag`` and every (token, tag)
    pair is passed to ``proper_lemmatize``. The results replace the original
    token list at the same position, so ``element`` itself is modified.
    Empty strings returned by ``proper_lemmatize`` are kept in the output.

    Parameters
    ----------
    element : list of list of str
        Token lists, one per sentence.

    Returns
    -------
    list of list of str
        The same ``element`` object, now holding lemmas.
    """
    for position, tokens in enumerate(element):
        lemmas = []
        for token, pos in pos_tag(tokens):
            lemmas.append(proper_lemmatize(token, pos))
        element[position] = lemmas

    return element

In [20]:
# Lemmatize the token lists of both candidates' sentences.
df = df.assign(
    Trump_sentences=lambda frame: frame['Trump_sentences'].apply(apply_lemmetize),
    Biden_sentences=lambda frame: frame['Biden_sentences'].apply(apply_lemmetize),
)

__역토큰화__

In [21]:
# Detokenize: join each token list back into one space-separated string,
# so every article again holds a list of sentence strings.
df['Trump_sentences'] = df['Trump_sentences'].apply(
    lambda sentences: [' '.join(tokens) for tokens in sentences]
)
df['Biden_sentences'] = df['Biden_sentences'].apply(
    lambda sentences: [' '.join(tokens) for tokens in sentences]
)

__감성 분석__

In [22]:
def get_sentiment_mean(element):
    """Average the sentiment scores over the sentences of one article.

    Every sentence is scored with the module-level ``analyser`` via
    ``polarity_scores``; the scores are summed per key and divided by the
    number of sentences.

    Parameters
    ----------
    element : sequence of str
        Sentences of a single article.

    Returns
    -------
    dict
        Mean score under the keys 'neg', 'neu', 'pos' and 'compound'.
        An empty ``element`` yields 0 for every key.
    """
    totals = dict.fromkeys(('neg', 'neu', 'pos', 'compound'), 0)
    sentence_count = len(element)

    # Nothing to score: hand back the all-zero dict and avoid dividing by zero.
    if not sentence_count:
        return totals

    for text in element:
        for label, value in analyser.polarity_scores(text).items():
            totals[label] += value

    return {label: total / sentence_count for label, total in totals.items()}

In [23]:
# Shared VADER analyzer instance; read as a global by get_sentiment_mean.
analyser = SentimentIntensityAnalyzer()

In [24]:
# Expand the per-article score dicts into columns prefixed with the candidate name.
#
# The frames are built on df's own index. Rows were dropped from df after
# loading, so its index may have gaps. A frame built from a plain list gets a
# fresh 0..n-1 index, and the later column-wise pd.concat (which aligns on index
# labels) would then attach scores to the wrong articles and add NaN rows.
Trump_score_df = pd.DataFrame(
    df['Trump_sentences'].apply(get_sentiment_mean).tolist(),
    index=df.index,
).add_prefix('Trump_')
Biden_score_df = pd.DataFrame(
    df['Biden_sentences'].apply(get_sentiment_mean).tolist(),
    index=df.index,
).add_prefix('Biden_')

In [25]:
# Append both candidates' score columns to the article table (aligned on index)
# and preview the first rows.
# NOTE: running this cell twice adds the score columns a second time.
df = pd.concat(
    [df, Trump_score_df, Biden_score_df],
    axis='columns',
)
df.head(10)

Unnamed: 0,Title,Date,Press,Topic,Text,Trump_sentences,Biden_sentences,Trump_neg,Trump_neu,Trump_pos,Trump_compound,Biden_neg,Biden_neu,Biden_pos,Biden_compound
0,Newt Gingrich: Democrats prove they are the 'L...,2020-09-01,Fox,2020 Presidential Election,[Fox News Flash top entertainment and celebrit...,[election year story scandals outrageous momen...,[election year story scandals outrageous momen...,0.089643,0.741071,0.169143,0.072971,0.054308,0.774538,0.171154,0.221808
1,Ed Rollins says election could come down to Wi...,2020-09-01,Fox,2020 Presidential Election,[White House Strategic Communications Director...,[white house strategic communication director ...,[white house strategic communication director ...,0.0885,0.7445,0.16725,0.382975,0.074667,0.702667,0.223,0.6446
2,Anti-Trump Lincoln Project launches coalition ...,2020-09-01,Fox,ELECTIONS,"[House Minority Leader Kevin McCarthy, R-Calif...",[house minority leader kevin mccarthy r-calif ...,[go city last month president different part c...,0.026,0.793,0.181,0.3712,0.066,0.811,0.123,0.3182
3,Biden’s tough tone on riots comes amid attacks...,2020-09-01,Fox,ELECTIONS,[Steve Hilton highlights President Trump's ful...,[steve hilton highlight president Trump fulfil...,[crunch time president Trump opponent former v...,0.070545,0.812909,0.116545,0.104764,0.028889,0.821889,0.149222,0.242411
4,NY attorney general files injunction to stop T...,2020-09-02,Fox,ELECTIONS,[President Trump holds a ‘Making America Great...,[president Trump hold make america great rally...,[],0.11175,0.65825,0.23,0.4052,0.0,0.0,0.0,0.0
5,Biden rips Trump over school reopening struggl...,2020-09-02,Fox,ELECTIONS,"[Former Arkansas Gov., Mike Huckabee discusses...",[mike huckabee discuss presidential candidate ...,[mike huckabee discuss presidential candidate ...,0.089857,0.762714,0.147571,0.1792,0.115455,0.740182,0.144455,0.123673
6,"Wisconsin donors, especially in Kenosha, suppo...",2020-09-02,Fox,2020 Presidential Election,[Trump 2020 campaign senior adviser gives host...,[Trump campaign senior adviser give hostage re...,[],0.03875,0.683,0.2785,0.410325,0.0,0.0,0.0,0.0
7,Biden campaign mocked for presence in 'Animal ...,2020-09-02,Fox,2020 Presidential Election,[Former House Speaker breaks down the vice pre...,[president Trump electoral vote four year ago ...,[former house speaker break vice presidential ...,0.143,0.771,0.086,-0.13695,0.015,0.861429,0.123571,0.3387
8,How to safely vote during the coronavirus pand...,2020-09-02,Fox,2020 Presidential Election,[Will Cain has the latest FOX Bet Super 6 cont...,[],[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,David Bossie: Kenosha visit shows Trump defend...,2020-09-02,Fox,2020 Presidential Election,"[‘The Five’ discusses how president, Democrati...",[five discus president democratic nominee Bide...,[five discus president democratic nominee Bide...,0.053,0.8845,0.0625,-0.12695,0.080667,0.877667,0.041667,-0.1833


In [26]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.validators.scatter.marker import SymbolValidator

In [27]:
# Turn each article's mean compound score into a label:
# 1 when >= 0.05, -1 when <= -0.05, otherwise 0.
df['Trump_sentiment'] = np.select(
    [df['Trump_compound'] >= 0.05, df['Trump_compound'] <= -0.05],
    [1, -1],
    default=0,
)
df['Biden_sentiment'] = np.select(
    [df['Biden_compound'] >= 0.05, df['Biden_compound'] <= -0.05],
    [1, -1],
    default=0,
)

In [28]:
# Average every score column per publication date, one frame per candidate.
# NOTE(review): articles with no sentence about a candidate carry all-zero
# scores (get_sentiment_mean returns zeros for an empty list) and are included
# in these means - confirm that this dilution is intended.
Trump_senti_groupby_date = (
    df.groupby('Date')[
        ['Trump_neg', 'Trump_neu', 'Trump_pos', 'Trump_compound', 'Trump_sentiment']
    ].agg('mean')
)
Biden_senti_groupby_date = (
    df.groupby('Date')[
        ['Biden_neg', 'Biden_neu', 'Biden_pos', 'Biden_compound', 'Biden_sentiment']
    ].agg('mean')
)

In [31]:
# One bar chart per candidate: date on the x axis, daily mean of the
# per-article sentiment label on the y axis.
Trump_fig = px.bar(
    Trump_senti_groupby_date,
    x=Trump_senti_groupby_date.index,
    y="Trump_sentiment",
).update_layout(title_text='News about Trump')

Biden_fig = px.bar(
    Biden_senti_groupby_date,
    x=Biden_senti_groupby_date.index,
    y="Biden_sentiment",
).update_layout(title_text='News about Biden')

Trump_fig.show()
Biden_fig.show()