## Exploratory Sentiment Analysis of Wikipedia Events from 1945 - 2017
- Hye Chang [hyechang@uchicago.edu](mailto:hyechang@uchicago.edu)
- I scrape Wikipedia to collect major events that happen between 1945 and 2017, aggregate text by year and process them using pandas and NLTK, conduct a basic sentiment analysis using NLTK, and visualize the trend over the years using Plotly.

### Aggregate Major Events by Year Using Wikipedia Data
##### Uses the [Wikipedia](https://github.com/goldsmith/Wikipedia) python wrapper by Jonathan Goldsmith

In [1]:
import wikipedia as wiki
import re
import pandas as pd

MONTHS = ['January', 'February', 'March', 'April',
          'May', 'June', 'July', 'August',
          'September', 'October', 'November', 'December']

def get_year_month_events(year):
    '''
    Parses Wikipedia page for given year and creates a list of events
    Inputs:
        (string) of year
    Returns:
        (list) of (tuple) of the form (year, month, event)
    '''

    page = wiki.WikipediaPage(title=year)
    content = page.content
    events = re.search("== Events ==(.|\n)*== Births ==", content)
    if events is None:
        events = re.search("== Events ==(.|\n)*== Deaths ==", content)
    events = events.group()

    event_list = []
    for month in MONTHS:
        lines = re.finditer("(({}) [0-9 ]*(–|-) )[^\n]*".format(month), events)
        for line in lines:
            actual_event = re.search("(–|-) .*", line.group(0)).group()[2:]
            event_list.append((year, month, actual_event))

    return event_list


def get_all_events(start, end):
    '''
    Gathers list of all events for given date range
    Inputs: a pair of (integers)
    Returns:
        (list) of (tuples) of form (year, month, event)
    '''

    all_events = []
    for i in range(start, end + 1):
        year = str(i)
        all_events += get_year_month_events(year)
    return all_events


def create_events_df(start, end):
    '''
    Creates dataframe of all events for givent timeframe
    Inputs: pair of (integers) - start year, end year
    Returns:
        pandas (dataframe) with columns 'Year', 'Month' and 'Event'
    '''

    events_list = get_all_events(start, end)
    events_df = pd.DataFrame(events_list, columns=['Year', 'Month', 'Event'])
    
    return events_df

### Scrape Data for Years 1945 - 2017

In [2]:
edf = create_events_df(1945, 2017)

In [3]:
edf.head()

Unnamed: 0,Year,Month,Event
0,1945,January,WWII: Allied advance from Paris to the Rhine c...
1,1945,January,WWII:
2,1945,January,Australia recognizes the Polish Committee of N...
3,1945,January,"A German offensive recaptures Esztergom, Hunga..."
4,1945,January,WWII: British General Bernard Montgomery holds...


### Conduct Basic Sentiment Analysis on Text by Year
#### Uses NLTK Python Package

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer



In [5]:
def process_text(text):
    '''
    Processes a string and returns a list of corresponding clean tokens.
    Inputs: (string)
    Returns: (list) of tokens
    ''' 
    
    text.lower()   
    
    # Tokenize and Remove Punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    # Stem
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    
    # Remove Stop Words
    clean_tokens = [t for t in tokens if t not in stopwords.words('english')]
    clean_tokens = list(set(clean_tokens))

    return clean_tokens

In [6]:
def sentiment_polarity(tokens):
    '''
    Calculates basic sentiment scores from list of tokens
    Inputs: (list) of tokens
    Returns: (dictionary) of 'compound', 'pos', 'neg', 'neu' and their corresponding scores
    '''

    sid = SentimentIntensityAnalyzer()
    s = ' '.join(tokens)
    sentiments = sid.polarity_scores(s)
    
    # Normalize by Number of Tokens
    for sent in sentiments:
        sentiments[sent] = sentiments[sent] / len(tokens)
    
    return sentiments

In [7]:
def aggregate_yearly(events_df):
    '''
    Takes events dataframe and computes yearly tokens and sentiments from those tokens
    Inputs: (dataframe) of events
    Returns: (dataframe) of tokens and sentiment scores per year
    '''
    
    events_df['Tokens'] = events_df.Event.apply(process_text)
    token_df = events_df.groupby('Year').Tokens.sum().to_frame()
    token_df['scores'] = token_df.Tokens.apply(sentiment_polarity)
    
    return token_df

In [8]:
tdf = aggregate_yearly(edf)

In [9]:
tdf.head()

Unnamed: 0_level_0,Tokens,scores
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1945,"[pari, alli, wwii, unit, armi, state, line, si...","{'neg': 8.171206225680933e-05, 'neu': 0.000525..."
1946,"[1937, alli, zone, recogn, occup, austrian, bo...","{'neg': 0.0001512027491408935, 'neu': 0.000924..."
1947,"[first, unit, state, time, congress, televis, ...","{'neg': 0.00012396006655574044, 'neu': 0.00066..."
1948,"[first, prime, shwe, unit, Nu, union, U, burma...","{'neg': 7.766990291262136e-05, 'neu': 0.000691..."
1949,"[war, 1947, divis, kashmir, 2018, ceasefir, pa...","{'neg': 7.907742998352554e-05, 'neu': 0.000685..."


In [10]:
def final_df(token_df):
    '''
    Widens Data and Creates Final DF Ready for Visualization
    Inputs: (dataframe) of tokens and sentiment scores per year
    Returns: (dataframe) of sentiments and their scores by year
    '''
    
    order = ('neg', 'neu', 'pos','compound')
    new_list = []
    for year, row in token_df.iterrows():
        yearly_sents = [year]
        for sentiment in order:
            yearly_sents.append(row.scores[sentiment])
        new_list.append(yearly_sents)
    
    df = pd.DataFrame(new_list, columns=['Year', 'Negative', 'Neutral', 'Positive', 'Compound'])
    
    return df

In [11]:
final = final_df(tdf)

In [12]:
final.head()

Unnamed: 0,Year,Negative,Neutral,Positive,Compound
0,1945,8.2e-05,0.000525,4.2e-05,-0.000648
1,1946,0.000151,0.000924,7e-05,-0.001142
2,1947,0.000124,0.000668,4e-05,-0.000831
3,1948,7.8e-05,0.000692,4e-05,-0.000806
4,1949,7.9e-05,0.000685,6e-05,-0.000815


### Visualize Data with Plotly

In [21]:
import plotly.offline as py
import plotly.graph_objs as go

import math

py.init_notebook_mode(connected=True)


def scatter_trace(sentiment, df):
    '''
    Generate line trace for each sentiment
    '''
    
    df = df[['Year', sentiment]].sort_values(by='Year')
    trace = go.Scatter(x=df['Year'],
                       y=df[sentiment].apply(math.log),
                       name=sentiment,
                       mode='lines',
                       line=dict(shape='spline'))
    return trace


def scatter_plot(df):
    '''
    Plot Data
    '''
    
    data = [scatter_trace(f, df) for f in ['Negative', 'Neutral', 'Positive']]
    layout = go.Layout(title="Evolution of Sentiment: 1945 - 2017",
                    xaxis=dict(title='Year',
                               rangeslider=dict()),
                    yaxis=dict(title='Log of Sentiment Score'))
    fig = go.Figure(data=data, layout=layout)

    py.iplot(fig)

In [22]:
scatter_plot(final)