In [5]:
import pandas as pd
import numpy as np
import re

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
import matplotlib.pyplot as plt
from PIL import Image
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly. validators.scatter.marker import SymbolValidator
from wordcloud import WordCloud 

__경로 설정__

In [8]:
# Directory holding the input CSV files. File names are appended to it with a
# plain `+` further down, so the value should end with a path separator.
path = '파일 경로'

__데이터 로드__

In [None]:
import ast

# Load the four pre-processed corpora: tweets written by each candidate
# (from_*) and tweets addressed to each candidate (to_*).
from_biden_corpus = pd.read_csv(path+'전처리 완료 파일명')
to_biden_corpus = pd.read_csv(path+'전처리 완료 파일명')
from_trump_corpus = pd.read_csv(path+'전처리 완료 파일명')
to_trump_corpus = pd.read_csv(path+'전처리 완료 파일명')

# Convert each loaded corpus with ast.literal_eval, every one from its OWN
# variable so the four corpora stay distinct.
# NOTE(review): ast.literal_eval accepts a string (or AST node), whereas
# pd.read_csv returns a DataFrame. How the pre-processed file stores the
# sentence list is not visible here — confirm the schema and pass the stored
# string (e.g. a single cell or column) instead of the whole frame.
from_biden_corpus = ast.literal_eval(from_biden_corpus)
to_biden_corpus = ast.literal_eval(to_biden_corpus)
from_trump_corpus = ast.literal_eval(from_trump_corpus)
to_trump_corpus = ast.literal_eval(to_trump_corpus)

## TF Wordcloud

In [None]:
def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's terms in column order.

    Newer scikit-learn only offers get_feature_names_out(); older releases
    only offer get_feature_names(). Use whichever exists.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def _frequency_table(count_matrix, vectorizer):
    """Turn a one-document count matrix into {'frequency': {term: count}}."""
    table = pd.DataFrame(count_matrix.toarray(), columns=_vocabulary_terms(vectorizer)).T
    # After the transpose the single document is column 0.
    return table.rename(columns={0: 'frequency'}).to_dict()


# Merge each sentence-level corpus into one long text, kept as a
# one-element list so the vectorizer sees exactly one document.
fbc_text = [' '.join(from_biden_corpus)]  # from_biden_corpus
tbc_text = [' '.join(to_biden_corpus)]    # to_biden_corpus
ftc_text = [' '.join(from_trump_corpus)]  # from_trump_corpus
ttc_text = [' '.join(to_trump_corpus)]    # to_trump_corpus

# One independent vectorizer per corpus.
vector_fbc = CountVectorizer()
vector_tbc = CountVectorizer()
vector_ftc = CountVectorizer()
vector_ttc = CountVectorizer()

# Count the words of each single-document collection.
fbc = vector_fbc.fit_transform(fbc_text)
tbc = vector_tbc.fit_transform(tbc_text)
ftc = vector_ftc.fit_transform(ftc_text)
ttc = vector_ttc.fit_transform(ttc_text)

# Nested dicts of the form {'frequency': {word: count}}.
fb = _frequency_table(fbc, vector_fbc)
ft = _frequency_table(ftc, vector_ftc)
tb = _frequency_table(tbc, vector_tbc)
tt = _frequency_table(ttc, vector_ttc)

#### 이미지 마스크

In [None]:
# Directory holding the mask images; it is concatenated with file names via
# `+`, so the value should end with a path separator.
path_image= '이미지 경로'

In [None]:
# Read the three mask images (Biden, Trump, crowd) into NumPy arrays.
biden_mask, trump_mask, crowd_mask = (
    np.array(Image.open(path_image + mask_file))
    for mask_file in ("바이든 이미지 파일명", "트럼프 이미지 파일명", "대중 이미지 파일명")
)

#### 바이든 트위터 워드클라우드

In [None]:
# Term-frequency word cloud of Biden's own tweets, using biden_mask as the mask.
fbTF = WordCloud(
    relative_scaling=0.2,
    background_color='white',
    mask=biden_mask,
    contour_width=2,
    contour_color='darkblue',
)
# generate_from_frequencies() fills the cloud and returns the same object.
fbTF.generate_from_frequencies(fb['frequency'])

plt.figure(figsize=(15, 10))
plt.imshow(fbTF)
plt.axis("off")  # no ticks or frame around the image
plt.show()

#### 트럼프 트위터 워드클라우드

In [None]:
# Term-frequency word cloud of Trump's own tweets, using trump_mask as the mask.
ftTF = WordCloud(
    relative_scaling=0.2,
    background_color='white',
    mask=trump_mask,
    contour_width=2,
    contour_color='darkred',
)
# generate_from_frequencies() fills the cloud and returns the same object.
ftTF.generate_from_frequencies(ft['frequency'])

plt.figure(figsize=(15, 10))
plt.imshow(ftTF)
plt.axis("off")  # no ticks or frame around the image
plt.show()

#### 사람들이 바이든에게 보낸 트위터 워드클라우드

In [None]:
# Term-frequency word cloud of tweets sent to Biden, using crowd_mask as the mask.
tbTF = WordCloud(
    relative_scaling=0.2,
    background_color='white',
    mask=crowd_mask,
    contour_width=1,
    contour_color='darkblue',
)
# generate_from_frequencies() fills the cloud and returns the same object.
tbTF.generate_from_frequencies(tb['frequency'])

plt.figure(figsize=(15, 10))
plt.imshow(tbTF)
plt.axis("off")  # no ticks or frame around the image
plt.show()

#### 사람들이 트럼프에게 보낸 트위터 워드클라우드

In [None]:
# Term-frequency word cloud of tweets sent to Trump, using crowd_mask as the mask.
ttTF = WordCloud(
    relative_scaling=0.2,
    background_color='white',
    mask=crowd_mask,
    contour_width=1,
    contour_color='darkred',
)
# generate_from_frequencies() fills the cloud and returns the same object.
ttTF.generate_from_frequencies(tt['frequency'])

plt.figure(figsize=(15, 10))
plt.imshow(ttTF)
plt.axis("off")  # no ticks or frame around the image
plt.show()

## TF-IDF Wordcloud

In [None]:
def TF_IDF(corpus):
    """Fit a TfidfVectorizer (min_df=2) on `corpus` and return the fitted vectorizer."""
    vectorizer = TfidfVectorizer(min_df=2)
    vectorizer.fit(corpus)  # fit() returns the vectorizer itself
    return vectorizer

# For every corpus: fit a vectorizer, build the dense sentence-by-term TF-IDF
# matrix, invert the vocabulary (column index -> word), add the weights up
# over all sentences, and map each word to its summed weight.

# Tweets written by Biden.
from_biden_tfidf = TF_IDF(from_biden_corpus)
from_biden_TFIDF_matrix = from_biden_tfidf.transform(from_biden_corpus).toarray()
from_biden_word_index = {column: term for term, column in from_biden_tfidf.vocabulary_.items()}
from_biden_TFIDF_sum = from_biden_TFIDF_matrix.sum(axis=0)
from_biden_TFIDF_frequencies = {
    from_biden_word_index[column]: weight
    for column, weight in enumerate(from_biden_TFIDF_sum)
}

# Tweets addressed to Biden.
to_biden_tfidf = TF_IDF(to_biden_corpus)
to_biden_TFIDF_matrix = to_biden_tfidf.transform(to_biden_corpus).toarray()
to_biden_word_index = {column: term for term, column in to_biden_tfidf.vocabulary_.items()}
to_biden_TFIDF_sum = to_biden_TFIDF_matrix.sum(axis=0)
to_biden_TFIDF_frequencies = {
    to_biden_word_index[column]: weight
    for column, weight in enumerate(to_biden_TFIDF_sum)
}

# Tweets written by Trump.
from_trump_tfidf = TF_IDF(from_trump_corpus)
from_trump_TFIDF_matrix = from_trump_tfidf.transform(from_trump_corpus).toarray()
from_trump_word_index = {column: term for term, column in from_trump_tfidf.vocabulary_.items()}
from_trump_TFIDF_sum = from_trump_TFIDF_matrix.sum(axis=0)
from_trump_TFIDF_frequencies = {
    from_trump_word_index[column]: weight
    for column, weight in enumerate(from_trump_TFIDF_sum)
}

# Tweets addressed to Trump.
to_trump_tfidf = TF_IDF(to_trump_corpus)
to_trump_TFIDF_matrix = to_trump_tfidf.transform(to_trump_corpus).toarray()
to_trump_word_index = {column: term for term, column in to_trump_tfidf.vocabulary_.items()}
to_trump_TFIDF_sum = to_trump_TFIDF_matrix.sum(axis=0)
to_trump_TFIDF_frequencies = {
    to_trump_word_index[column]: weight
    for column, weight in enumerate(to_trump_TFIDF_sum)
}

#### 바이든 트위터 TF-IDF 워드클라우드

In [None]:
# TF-IDF-weighted word cloud of Biden's own tweets.
from_biden_wordcloud = WordCloud(relative_scaling=0.2, background_color='lightblue')
# generate_from_frequencies() fills the cloud and returns the same object.
from_biden_wordcloud.generate_from_frequencies(from_biden_TFIDF_frequencies)

plt.figure(figsize=(15, 10))
plt.imshow(from_biden_wordcloud)
plt.axis("off")  # no ticks or frame around the image
plt.show()

#### 트럼프 트위터 TF-IDF 워드클라우드

In [None]:
# TF-IDF-weighted word cloud of Trump's own tweets.
from_trump_wordcloud = WordCloud(relative_scaling=0.2, background_color='red')
# generate_from_frequencies() fills the cloud and returns the same object.
from_trump_wordcloud.generate_from_frequencies(from_trump_TFIDF_frequencies)

plt.figure(figsize=(15, 10))
plt.imshow(from_trump_wordcloud)
plt.axis("off")  # no ticks or frame around the image
plt.show()

#### 바이든에게 보낸 트위터 TF-IDF 워드클라우드

In [None]:
# TF-IDF-weighted word cloud of tweets sent to Biden.
to_biden_wordcloud = WordCloud(relative_scaling=0.2, background_color='lightblue')
# generate_from_frequencies() fills the cloud and returns the same object.
to_biden_wordcloud.generate_from_frequencies(to_biden_TFIDF_frequencies)

plt.figure(figsize=(15, 10))
plt.imshow(to_biden_wordcloud)
plt.axis("off")  # no ticks or frame around the image
plt.show()

#### 트럼프에게 보낸 트위터 TF-IDF 워드클라우드

In [None]:
# TF-IDF-weighted word cloud of tweets sent to Trump.
to_trump_wordcloud = WordCloud(relative_scaling=0.2, background_color='red')
# generate_from_frequencies() fills the cloud and returns the same object.
to_trump_wordcloud.generate_from_frequencies(to_trump_TFIDF_frequencies)

plt.figure(figsize=(15, 10))
plt.imshow(to_trump_wordcloud)
plt.axis("off")  # no ticks or frame around the image
plt.show()

## PLOTLY

####  데이터 처리

In [None]:
def _count_frame(count_matrix, vectorizer):
    """Return a frame indexed by term whose single column 0 holds the counts.

    get_feature_names() no longer exists in newer scikit-learn, so prefer
    get_feature_names_out() and fall back to the old name on older releases.
    """
    term_getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    return pd.DataFrame(count_matrix.toarray(), columns=term_getter()).T


# One term-by-count frame per corpus, rebuilt from the fitted CountVectorizers.
fb_df = _count_frame(fbc, vector_fbc)
ft_df = _count_frame(ftc, vector_ftc)
tb_df = _count_frame(tbc, vector_tbc)
tt_df = _count_frame(ttc, vector_ttc)

In [None]:
# Move the terms out of the index into an ordinary column named 'index'.
fb_df = fb_df.reset_index()
ft_df = ft_df.reset_index()
tb_df = tb_df.reset_index()
tt_df = tt_df.reset_index()

In [None]:
# Label the two columns, then order each frame from most to least frequent.
tf_column_labels = {'index': 'Word', 0: 'Frequency'}

fb_df = fb_df.rename(columns=tf_column_labels).sort_values(by='Frequency', ascending=False)
ft_df = ft_df.rename(columns=tf_column_labels).sort_values(by='Frequency', ascending=False)
tb_df = tb_df.rename(columns=tf_column_labels).sort_values(by='Frequency', ascending=False)
tt_df = tt_df.rename(columns=tf_column_labels).sort_values(by='Frequency', ascending=False)

####  TF plot

In [None]:
# Bar charts of the 20 most frequent words per corpus; the Biden charts use the
# reversed Cividis scale, the Trump charts a green-purple-red scale.
tf_bar_axes = dict(x='Word', y='Frequency', color='Frequency')
trump_tf_scale = ["green", "purple", "red"]

fb_df_fig = px.bar(
    fb_df.head(20),
    color_continuous_scale=px.colors.sequential.Cividis_r,
    title="From Biden's Twitter by Term Frequency",
    **tf_bar_axes,
)
ft_df_fig = px.bar(
    ft_df.head(20),
    color_continuous_scale=trump_tf_scale,
    title="From Trump's Twitter by Term Frequency",
    **tf_bar_axes,
)
tb_df_fig = px.bar(
    tb_df.head(20),
    color_continuous_scale=px.colors.sequential.Cividis_r,
    title="To Biden's Twitter by Term Frequency",
    **tf_bar_axes,
)
tt_df_fig = px.bar(
    tt_df.head(20),
    color_continuous_scale=trump_tf_scale,
    title="To Trump's Twitter by Term Frequency",
    **tf_bar_axes,
)

for tf_figure in (fb_df_fig, ft_df_fig, tb_df_fig, tt_df_fig):
    tf_figure.show()

#### TF-IDF plot

In [None]:
def _weights_to_frame(weights):
    """Build a frame indexed by word with one 'frequency' column from a {word: weight} dict."""
    return pd.DataFrame(weights, index=['frequency']).T


# One word-by-weight frame per corpus, from the summed TF-IDF dictionaries.
from_biden_plot = _weights_to_frame(from_biden_TFIDF_frequencies)
to_biden_plot = _weights_to_frame(to_biden_TFIDF_frequencies)
from_trump_plot = _weights_to_frame(from_trump_TFIDF_frequencies)
to_trump_plot = _weights_to_frame(to_trump_TFIDF_frequencies)

In [None]:
# Move the words out of the index into an ordinary column named 'index'.
from_biden_plot = from_biden_plot.reset_index()
to_biden_plot = to_biden_plot.reset_index()
from_trump_plot = from_trump_plot.reset_index()
to_trump_plot = to_trump_plot.reset_index()

In [None]:
# Give the TF-IDF frames the same column labels the bar charts ask for.
tfidf_column_labels = {'index': 'Word', 'frequency': 'Frequency'}

from_biden_plot = from_biden_plot.rename(columns=tfidf_column_labels)
to_biden_plot = to_biden_plot.rename(columns=tfidf_column_labels)
from_trump_plot = from_trump_plot.rename(columns=tfidf_column_labels)
to_trump_plot = to_trump_plot.rename(columns=tfidf_column_labels)

In [None]:
# Order every frame from the highest summed TF-IDF weight downwards.
from_biden_plot = from_biden_plot.sort_values(by='Frequency', ascending=False)
to_biden_plot = to_biden_plot.sort_values(by='Frequency', ascending=False)
from_trump_plot = from_trump_plot.sort_values(by='Frequency', ascending=False)
to_trump_plot = to_trump_plot.sort_values(by='Frequency', ascending=False)

In [None]:
# Bar charts of the 20 highest-weighted words per corpus; the Biden charts use
# the reversed Cividis scale, the Trump charts a green-purple-red scale.
tfidf_bar_axes = dict(x='Word', y='Frequency', color='Frequency')
trump_tfidf_scale = ["green", "purple", "red"]

from_biden_plot_fig = px.bar(
    from_biden_plot.head(20),
    color_continuous_scale=px.colors.sequential.Cividis_r,
    title="From Biden's Twitter by TF-IDF",
    **tfidf_bar_axes,
)
to_biden_plot_fig = px.bar(
    to_biden_plot.head(20),
    color_continuous_scale=px.colors.sequential.Cividis_r,
    title="To Biden's Twitter by TF-IDF",
    **tfidf_bar_axes,
)
from_trump_plot_fig = px.bar(
    from_trump_plot.head(20),
    color_continuous_scale=trump_tfidf_scale,
    title="From Trump's Twitter by TF-IDF",
    **tfidf_bar_axes,
)
to_trump_plot_fig = px.bar(
    to_trump_plot.head(20),
    color_continuous_scale=trump_tfidf_scale,
    title="To Trump's Twitter by TF-IDF",
    **tfidf_bar_axes,
)

for tfidf_figure in (from_biden_plot_fig, to_biden_plot_fig, from_trump_plot_fig, to_trump_plot_fig):
    tfidf_figure.show()

# 감성분석

In [None]:
from sentiment_lexicon import Lexicon
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def sentiment_analyzer_scores(sentence):
    """Print `sentence`, dash-padded to 40 characters, followed by its polarity scores.

    Looks up the notebook-level `analyser` object at call time; returns None.
    """
    polarity = analyser.polarity_scores(sentence)
    line_template = "{:-<40} {}"
    print(line_template.format(sentence, str(polarity)))

객체 생성

In [None]:
# Notebook-level analyzer instance; sentiment_analyzer_scores() looks it up by this name.
analyser = SentimentIntensityAnalyzer()

분석용 개별 데이터 프레임 생성

In [None]:
# Put the raw 'date' and 'text' columns next to the pre-processed sentences.
# The columns are joined side by side, i.e. rows are matched on the index.
raw_columns = ['date', 'text']

# Tweets addressed to Biden.
to_biden_df = pd.read_csv(path + "원본 데이터 파일명")
to_biden_corpus_df = pd.DataFrame(to_biden_corpus, columns=['sent_corpus'])
new_to_biden_df = pd.concat([to_biden_df[raw_columns], to_biden_corpus_df], axis=1)

# Tweets addressed to Trump.
to_trump_df = pd.read_csv(path + "원본 데이터 파일명")
to_trump_corpus_df = pd.DataFrame(to_trump_corpus, columns=['sent_corpus'])
new_to_trump_df = pd.concat([to_trump_df[raw_columns], to_trump_corpus_df], axis=1)

감정 점수 계산 및 저장

In [None]:
# Biden: score every pre-processed tweet and keep one text line per tweet —
# the tweet dash-padded to 65 characters, a space, then the score dict as text.
analyzer = SentimentIntensityAnalyzer()
new_to_biden_temp = [
    "{:-<65} {}".format(tweet, str(analyzer.polarity_scores(tweet)))
    for tweet in new_to_biden_df['sent_corpus']
]

# Trump: same procedure with a freshly created analyzer.
analyzer = SentimentIntensityAnalyzer()
new_to_trump_temp = [
    "{:-<65} {}".format(tweet, str(analyzer.polarity_scores(tweet)))
    for tweet in new_to_trump_df['sent_corpus']
]

항목별 감정 점수 딕셔너리 변환 및 저장

In [None]:
def _parse_scored_line(text):
    """Split one "<padded sentence> {scores}" line into a flat record.

    Args:
        text: a line such as
            "good day--- {'neg': 0.0, 'neu': 0.4, 'pos': 0.6, 'compound': 0.5}".

    Returns:
        dict with the keys "sentence" (everything before the first "{",
        padding included) and "neg", "neu", "pos", "compound" (the score
        values as stripped strings, ready for a float conversion).

    Raises:
        KeyError: if one of the four score names is missing from the line.
    """
    pieces = text.split("{")
    # The score dict is whatever follows the last "{". Strip only the closing
    # brace: cutting a fixed number of characters off the end would also
    # remove the last digit of the compound score (0.5 would become 0.).
    raw_scores = pieces[-1].strip().rstrip("}")
    scores = {}
    for item in raw_scores.split(","):
        key, _, value = item.partition(":")
        scores[key.strip().strip("'")] = value.strip()
    return {
        "sentence": pieces[0],
        "neg": scores["neg"],
        "neu": scores["neu"],
        "pos": scores["pos"],
        "compound": scores["compound"],
    }


# Biden: one record per scored line.
df_to_biden_senti = pd.DataFrame(new_to_biden_temp)
temp_biden = [_parse_scored_line(text) for text in df_to_biden_senti[0]]

# Trump: one record per scored line.
df_to_trump_senti = pd.DataFrame(new_to_trump_temp)
temp_trump = [_parse_scored_line(text) for text in df_to_trump_senti[0]]

- positive sentiment: compound score >= 0.05
- neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
- negative sentiment: compound score <= -0.05

In [None]:
# Turn the parsed records into frames and convert the score columns to float32.
score_columns = ["neg", "neu", "pos", "compound"]

# Biden
df_temp_biden = pd.DataFrame(temp_biden)
df_temp_biden[score_columns] = df_temp_biden[score_columns].astype("float32")

# Trump
df_temp_trump = pd.DataFrame(temp_trump)
df_temp_trump[score_columns] = df_temp_trump[score_columns].astype("float32")

In [None]:
# Attach the score columns to the date/text/sentence frames, side by side.
to_trump_senti = pd.concat(
    [new_to_trump_df, df_temp_trump],
    axis=1,
)
to_biden_senti = pd.concat(
    [new_to_biden_df, df_temp_biden],
    axis=1,
)

In [None]:
# Label each tweet from its compound score:
# 1 when compound >= 0.05, -1 when compound <= -0.05, otherwise 0.
biden_compound = to_biden_senti['compound']
to_biden_senti['sentiment'] = np.where(
    biden_compound >= 0.05,
    1,
    np.where(biden_compound <= -0.05, -1, 0),
)

trump_compound = to_trump_senti['compound']
to_trump_senti['sentiment'] = np.where(
    trump_compound >= 0.05,
    1,
    np.where(trump_compound <= -0.05, -1, 0),
)

In [None]:
# Average every score column per value of 'date'.
sentiment_columns = ['neg','neu','pos','compound','sentiment']
to_trump_senti_by_date = to_trump_senti.groupby('date')[sentiment_columns].mean()
to_biden_senti_by_date = to_biden_senti.groupby('date')[sentiment_columns].mean()

# After the groupby the dates live in the index, so the index has to be
# written out; with index=False the files would hold scores without dates.
to_trump_senti_by_date.to_csv("트럼프 트위터 감성정수 파일명", index=True)
to_biden_senti_by_date.to_csv("바이든 트위터 감성정수 파일명", index=True)

## 감성분석 Plot

__일별 감성 점수 추이 시각화__

In [None]:
# Line charts of the per-date mean sentiment label, one per candidate.
trump_line_fig = px.line(
    to_trump_senti_by_date,
    x=to_trump_senti_by_date.index,
    y=to_trump_senti_by_date['sentiment'],
    title='Sentiment towards Trump',
)
biden_line_fig = px.line(
    to_biden_senti_by_date,
    x=to_biden_senti_by_date.index,
    y=to_biden_senti_by_date['sentiment'],
    title='Sentiment towards Biden',
)

# Both charts get the same range slider and the same quick-range buttons.
date_range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]
for line_fig in (trump_line_fig, biden_line_fig):
    line_fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(buttons=date_range_buttons),
    )

trump_line_fig.show()
biden_line_fig.show()

__두 후보의 개별 감성 점수 바 그래프__

In [None]:
# Bar charts of the per-date mean sentiment label, one per candidate.
trump_bar_stock_fig = px.bar(
    to_trump_senti_by_date,
    x=to_trump_senti_by_date.index,
    y="sentiment",
)
biden_bar_stock_fig = px.bar(
    to_biden_senti_by_date,
    x=to_biden_senti_by_date.index,
    y="sentiment",
)

for bar_fig in (trump_bar_stock_fig, biden_bar_stock_fig):
    bar_fig.show()

__두 후보의 바 그래프 통합 시각화__

In [None]:
# Both candidates' per-date sentiment in one figure, as two bar series.
trump_bars = go.Bar(
    x=to_trump_senti_by_date.index,
    y=to_trump_senti_by_date.sentiment,
    name='Sentiment towards Trump',
    marker_color='indianred',
)
biden_bars = go.Bar(
    x=to_biden_senti_by_date.index,
    y=to_biden_senti_by_date.sentiment,
    name='Sentiment towards Biden',
    marker_color='darkblue',
)
trump_biden_sentimen = go.Figure(data=[trump_bars, biden_bars])

# Grouped bars, x tick labels rotated by -45 degrees, fixed canvas size.
trump_biden_sentimen.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    width=2000,
    height=800,
)
trump_biden_sentimen.show()

__Pos , Neg , Neu , Compound 및 감성 분류 항목별 통합 추이 그래프__

In [None]:
# Name the column axis 'emotion' so the area charts can facet on that name.
for daily_scores in (to_trump_senti_by_date, to_biden_senti_by_date):
    daily_scores.columns.name = 'emotion'

In [None]:
# Area charts with one facet per score column, wrapped two facets per row.
area_layout = dict(facet_col="emotion", facet_col_wrap=2)

trump_area_fig = px.area(to_trump_senti_by_date, title='Emotion towards Trump', **area_layout)
biden_area_fig = px.area(to_biden_senti_by_date, title='Emotion towards Biden', **area_layout)

for area_fig in (trump_area_fig, biden_area_fig):
    area_fig.show()