In [1]:
import pandas as pd

In [2]:
%config Completer.use_jedi = False

In [3]:
from ast import literal_eval
from nltk.corpus import stopwords

In [4]:
from collections import Counter

In [5]:
from utils.basic_tokenize import tokenize

In [6]:
from gensim.utils import simple_preprocess

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [8]:
df = pd.read_csv(r"D:\data\openparliament\df_climate.csv")
df.drop('Unnamed: 0', axis=1, inplace=True)

In [9]:
df['text_en']= df.text_en.apply(lambda x: x.replace('Mr. Speaker,',' '))

In [10]:
df['tokens'] = df.tokenized_content.apply(literal_eval)

In [11]:
%%time
df['tokens'] = df.text_en.apply(lambda x: simple_preprocess(x, deacc=True))

Wall time: 5.17 s


In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
df['tokens'] = df.tokens.apply(lambda x: [w for w in x if not w in stop_words])

In [14]:
df.tokens.head()

0    [last, week, six, million, canadians, across, ...
1    [last, week, six, million, canadians, across, ...
2    [last, week, six, million, canadians, across, ...
3    [privilege, able, speak, amendments, bill, pro...
4    [privilege, able, speak, amendments, bill, pro...
Name: tokens, dtype: object

In [15]:
def bigrams(sentence):
    part1 = [i+ "_"+j for i,j in zip(sentence[::2], sentence[1::2])]
    part2 = [i+ "_"+j for i,j in zip(sentence[1::2], sentence[2::2])]
    
    part1.extend(part2)
    return part1

In [16]:
a = ['a','b','c']
b = [1,2,3]

In [17]:
list(zip(a,b))

[('a', 1), ('b', 2), ('c', 3)]

In [18]:
df['bigrams'] = df.tokens.apply(bigrams)

In [19]:
df.bigrams.head()

0    [last_week, six_million, canadians_across, cou...
1    [last_week, six_million, canadians_across, cou...
2    [last_week, six_million, canadians_across, cou...
3    [privilege_able, speak_amendments, bill_propos...
4    [privilege_able, speak_amendments, bill_propos...
Name: bigrams, dtype: object

In [20]:
bigram_counts = dict()
template_counter = Counter()
pids = list(set(df.politician_id))
for pol in pids:
    bigram_counts[pol] = template_counter.copy()

In [21]:
def update_counts(row):
    bigrams = row.loc['bigrams']
    if bigram_counts.get(row.loc['politician_id'])==None:
        bigram_counts[row.loc['politician_id']] = template_counter.copy()
        bigram_counts[row.loc['politician_id']].update(bigrams)
    else:
        bigram_counts[row.loc['politician_id']].update(bigrams)

In [22]:
df.apply(update_counts,axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
19655    None
19656    None
19657    None
19658    None
19659    None
Length: 19660, dtype: object

In [23]:
for key, counter in bigram_counts.items():
    print('politician #:', key)
    print(counter.most_common(4))

politician #: 10240
[('following_guidance', 1), ('federal_court', 1), ('appeal_move', 1), ('process_forward', 1)]
politician #: 1
[('new_government', 8), ('clean_air', 8), ('canada_new', 8), ('kootenay_columbia', 4)]
politician #: 2
[('member_happy', 4), ('know_article', 4), ('bases_question', 4), ('inaccurate_fact', 4)]
politician #: 5
[('carbon_tax', 22), ('prime_minister', 12), ('climate_change', 12), ('liberal_carbon', 6)]
politician #: 6
[('clean_air', 126), ('hon_member', 110), ('greenhouse_gases', 80), ('air_act', 80)]
politician #: 7
[('carbon_tax', 52), ('federal_government', 36), ('rejects_carbon', 32), ('hon_colleague', 28)]
politician #: 9
[('across_country', 48), ('oil_sands', 46), ('member_opposite', 40), ('canadian_jobs', 24)]
politician #: 10
[('green_algae', 21), ('blue_green', 21), ('federal_government', 12), ('laundry_detergents', 10)]
politician #: 11
[('prime_minister', 7), ('global_warming', 6), ('serious_issue', 5), ('food_crisis', 5)]
politician #: 12
[('st_lawr

load politician info

In [24]:
pol_info = pd.read_csv(r"D:\data\openparliament\politician_info.csv",index_col='Unnamed: 0')

note: how to sum arbitrarily many counters

In [25]:
pol_group = pol_info.groupby('party_id')

In [26]:
conservatives = df[df.party_id==1].politician_id.unique()
conservative_bigrams = sum([bigram_counts[p] for p in conservatives],Counter())
conservative_bigrams.most_common(15)

[('carbon_tax', 2682),
 ('greenhouse_gas', 1985),
 ('climate_change', 1816),
 ('gas_emissions', 1730),
 ('prime_minister', 1400),
 ('greenhouse_gases', 834),
 ('minister_environment', 771),
 ('reduce_greenhouse', 753),
 ('liberal_government', 720),
 ('hon_member', 703),
 ('liberal_party', 690),
 ('united_states', 616),
 ('would_like', 519),
 ('clean_air', 469),
 ('environment_minister', 401)]

In [27]:
bigram_counts[conservatives[0]].most_common(3)

[('climate_change', 111), ('co_operation', 72), ('prime_minister', 69)]

In [28]:
liberals = df[df.party_id==4].politician_id.unique()
liberal_bigrams = sum([bigram_counts[p] for p in liberals],Counter())
liberal_bigrams.most_common(15)

[('climate_change', 2934),
 ('prime_minister', 1183),
 ('hon_member', 789),
 ('minister_environment', 759),
 ('greenhouse_gas', 556),
 ('gas_emissions', 449),
 ('environmental_assessment', 415),
 ('would_like', 384),
 ('price_pollution', 381),
 ('united_states', 326),
 ('kyoto_protocol', 310),
 ('government_canada', 299),
 ('federal_government', 298),
 ('conservative_government', 268),
 ('fight_climate', 254)]

In [29]:
greens = df[df.party_id==9].politician_id.unique()
green_bigrams = sum([bigram_counts[p] for p in greens],Counter())
green_bigrams.most_common(15)

[('climate_change', 357),
 ('prime_minister', 218),
 ('fee_dividend', 165),
 ('carbon_fee', 150),
 ('greenhouse_gas', 87),
 ('minister_environment', 79),
 ('price_carbon', 77),
 ('greenhouse_gases', 75),
 ('parliamentary_secretary', 54),
 ('environmental_assessment', 50),
 ('thunder_bay', 48),
 ('let_us', 47),
 ('gas_emissions', 46),
 ('madam_speaker', 46),
 ('climate_emergency', 42)]

In [30]:
t = df.tokens.iloc[0]
    
    

In [31]:
t = t[0:4]

In [32]:
'_'.join(t)

'last_week_six_million'

In [33]:
def create_ngram(tokens,n):
    ngrams = []
    for i in range(len(tokens)-n):
        ngram = '_'.join(tokens[i:i+n])
        ngrams.append(ngram)
    
    return ngrams
    

In [34]:
n_gram_dict = dict()

In [35]:
for pol in pids:
    n_gram_dict[pol] = template_counter.copy()

In [36]:
def update_ngrams(row):
    tokens = row.loc['tokens']
    ngrams  = create_ngram(tokens,4)
    if n_gram_dict.get(row.loc['politician_id'])==None:
        n_gram_dict[row.loc['politician_id']] = template_counter.copy()
        n_gram_dict[row.loc['politician_id']].update(ngrams)
    else:
        n_gram_dict[row.loc['politician_id']].update(ngrams)

In [37]:
df.apply(update_ngrams,axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
19655    None
19656    None
19657    None
19658    None
19659    None
Length: 19660, dtype: object

In [38]:
n_gram_dict[173].most_common(4)

[('canadian_environmental_bill_rights', 6),
 ('establish_canadian_environmental_bill', 4),
 ('canada_student_loans_act', 4),
 ('crown_liability_proceedings_act', 4)]

In [39]:
green_4grams = sum([n_gram_dict[p] for p in greens],Counter())
green_4grams.most_common(15)

[('thunder_bay_superior_north', 27),
 ('national_marine_conservation_area', 18),
 ('carbon_fee_dividend_would', 15),
 ('equal_per_capita_basis', 15),
 ('lake_superior_national_marine', 15),
 ('superior_national_marine_conservation', 15),
 ('framework_convention_climate_change', 14),
 ('global_average_temperature_increase', 14),
 ('reduce_greenhouse_gas_emissions', 14),
 ('minister_environment_climate_change', 13),
 ('national_round_table_environment', 12),
 ('round_table_environment_economy', 12),
 ('preventing_dangerous_climate_change', 12),
 ('parliamentary_secretary_minister_environment', 9),
 ('canadian_environmental_assessment_act', 9)]

In [40]:
conservative_4grams = sum([n_gram_dict[p] for p in conservatives],Counter())
conservative_4grams.most_common(15)

[('reduce_greenhouse_gas_emissions', 495),
 ('reducing_greenhouse_gas_emissions', 172),
 ('job_killing_carbon_tax', 93),
 ('canadian_environmental_assessment_act', 91),
 ('reduction_greenhouse_gas_emissions', 90),
 ('sector_sector_regulatory_approach', 88),
 ('canadian_environmental_protection_act', 65),
 ('global_greenhouse_gas_emissions', 64),
 ('navigable_waters_protection_act', 62),
 ('plan_reduce_greenhouse_gas', 61),
 ('greenhouse_gas_emissions_canada', 50),
 ('greenhouse_gas_emissions_government', 48),
 ('reductions_greenhouse_gas_emissions', 45),
 ('first_time_canadian_history', 43),
 ('canada_clean_air_act', 43)]

In [41]:
liberal_4grams = sum([n_gram_dict[p] for p in liberals],Counter())
liberal_4grams.most_common(15)

[('reduce_greenhouse_gas_emissions', 109),
 ('canadian_environmental_assessment_act', 85),
 ('minister_environment_climate_change', 80),
 ('canadian_environmental_protection_act', 70),
 ('reducing_greenhouse_gas_emissions', 64),
 ('plan_fight_climate_change', 55),
 ('take_action_climate_change', 52),
 ('committee_environment_sustainable_development', 41),
 ('standing_committee_environment_sustainable', 39),
 ('environment_economy_go_together', 38),
 ('would_like_thank_member', 36),
 ('canadian_environmental_assessment_agency', 34),
 ('round_table_environment_economy', 31),
 ('yukon_environmental_socio_economic', 31),
 ('environmental_socio_economic_assessment', 31)]

In [42]:
party_colour_dict = {28:'conservative',25: 'conservative', 1:'conservative', 46:'quebec', 26: 'conservative',2: 'NDP',5: 'independent',
                4: 'liberal', 9: 'green', 3: 'bloc', 39: 'quebec'}

In [43]:
df['party'] = df.party_id.replace(party_colour_dict)

In [44]:
for key, counter in n_gram_dict.items():
    print('politician #:', key, "political party:",df[df.politician_id==key].party.iloc[0])
    print(counter.most_common(4))

politician #: 10240 political party: liberal
[('following_guidance_federal_court', 1), ('guidance_federal_court_appeal', 1), ('federal_court_appeal_move', 1), ('court_appeal_move_process', 1)]
politician #: 1 political party: conservative
[('kootenay_columbia_one_fantastic', 4), ('columbia_one_fantastic_areas', 4), ('one_fantastic_areas_earth', 4), ('fantastic_areas_earth_majestic', 4)]
politician #: 2 political party: conservative
[('member_happy_know_article', 4), ('happy_know_article_bases', 4), ('know_article_bases_question', 4), ('article_bases_question_inaccurate', 4)]
politician #: 5 political party: conservative
[('minister_transport_infrastructure_communities', 4), ('emitters_like_china_india', 4), ('like_china_india_united', 4), ('china_india_united_states', 4)]
politician #: 6 political party: conservative
[('canada_clean_air_act', 26), ('first_time_canadian_history', 24), ('every_industry_sector_across', 16), ('industry_sector_across_country', 16)]
politician #: 7 political

politician #: 262 political party: liberal
[('bill_act_provide_harmonization', 3), ('act_provide_harmonization_environmental', 3), ('provide_harmonization_environmental_standards', 3), ('harmonization_environmental_standards_throughout', 3)]
politician #: 264 political party: liberal
[('hope_conservative_party_finally', 2), ('today_celebrating_world_environment', 1), ('celebrating_world_environment_day', 1), ('world_environment_day_year', 1)]
politician #: 4361 political party: NDP
[('canadian_environmental_assessment_act', 4), ('rise_today_speak_bill', 2), ('opposing_bill_voting_second', 2), ('bill_voting_second_reading', 2)]
politician #: 2313 political party: liberal
[('supporting_government_house_divided', 1), ('government_house_divided_motion', 1), ('house_divided_motion_agreed', 1), ('divided_motion_agreed_following', 1)]
politician #: 266 political party: conservative
[('western_canadians_loss_explain', 2), ('canadians_loss_explain_environment', 2), ('loss_explain_environment_mi

politician #: 8828 political party: conservative
[('quite_clear_leader_ndp', 1), ('clear_leader_ndp_made', 1), ('leader_ndp_made_carbon', 1), ('ndp_made_carbon_tax', 1)]
politician #: 2693 political party: liberal
[('usual_bloc_quebecois_quick', 4), ('bloc_quebecois_quick_exaggerate', 4), ('quebecois_quick_exaggerate_cida', 4), ('quick_exaggerate_cida_viewed', 4)]
politician #: 4743 political party: liberal
[('certainly_pleased_program_ottawa', 1), ('pleased_program_ottawa_carleton', 1), ('program_ottawa_carleton_region', 1), ('ottawa_carleton_region_put', 1)]
politician #: 2698 political party: liberal
[('corporate_entity_responsible_department', 2), ('entity_responsible_department_national', 2), ('responsible_department_national_defence', 2), ('department_national_defence_greatly', 2)]
politician #: 2703 political party: NDP
[('parliamentary_secretary_minister_environment', 15), ('situ_oil_sands_projects', 9), ('standing_committee_environment_sustainable', 8), ('committee_environment

politician #: 1813 political party: liberal
[('canada_soon_participate_st', 2), ('soon_participate_st_united', 2), ('participate_st_united_nations', 2), ('st_united_nations_framework', 2)]
politician #: 1816 political party: NDP
[('national_pollutant_release_inventory', 4), ('incinerator_white_birch_paper', 2), ('white_birch_paper_mill', 2), ('birch_paper_mill_stadacona', 2)]
politician #: 3866 political party: liberal
[('cold_day_like_today', 2), ('day_like_today_able', 2), ('like_today_able_observe', 2), ('today_able_observe_various', 2)]
politician #: 3870 political party: NDP
[('member_windsor_st_clair', 2), ('commented_need_review_entire', 2), ('entire_environmental_assessment_process', 2), ('clearly_correctly_demonstrated_yesterday', 1)]
politician #: 10014 political party: liberal
[('aware_situation_asked_department', 1), ('situation_asked_department_additional', 1), ('asked_department_additional_briefings', 1), ('department_additional_briefings_additional', 1)]
politician #: 38

In [45]:
df.wordcount.describe()

count    19660.000000
mean       134.036673
std        259.079767
min          0.000000
25%         69.000000
50%         86.000000
75%        107.000000
max       6496.000000
Name: wordcount, dtype: float64

In [46]:
df.iloc[19659]

index                                                           853086
id                                                                4579
politician_id                                                    11678
riding_id                                                        46011
party_id                                                             2
end_date                                                            \N
start_date                                                  2019-10-21
id.1                                                           2833005
document_id                                                      20718
time                                            2020-02-26 15:45:00-05
h1_en                                              Routine Proceedings
h2_en                                     Climate Emergency Action Act
member_id                                                         4579
who_en                           Ms. Leah Gazan (Winnipeg Centre, NDP)
conten