In [1]:
import re, itertools, os
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', None)

import tensorflow_datasets as tfds
import nltk

TEXT_COL = 'comment_text'
SEED = 1313

In [2]:
PROJ_DIR = os.getcwd()
o_file=os.path.join(PROJ_DIR, 'data', 'revolution_twitter_corpus.csv')
concept = 'revolution'

if not os.path.exists(o_file):
    print('Creating Revolution Civil Comments Corpus')
    # Import data
    IMPORT_DIR='/Users/prl222/PycharmProjects/bias_datasets/civil_comments'
    data = pd.read_csv(os.path.join(IMPORT_DIR, 'all_data.csv'))
    print(data.shape[0])
    print(data.columns)

    # Find texts with intangigle concept

    def text_detection(text, concept):
        has_occurrence = False
        # bytes into string
        # text = text.decode()
        text = str(text)
        if re.search(r"\b" + re.escape(concept) + r"\b", text.lower()):
            has_occurrence = True
        return has_occurrence
        
    data['has_revolution'] = data[TEXT_COL].apply(lambda text:text_detection(text, concept))

    # save csv locally ordered by date
    data = data[data.has_revolution]
    data.sort_values(by='created_date').to_csv(o_file, index=False)
else:
    print('Importing Revolution Civil Comments Corpus')
    data = pd.read_csv(o_file)
    print(data.shape[0])
    print(data.columns)

data.head()

Importing Revolution Civil Comments Corpus
3047
Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count', 'has_revolution'],
      dtype='object')


Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count,has_revolution
0,240643,"Hillary never had a shot at getting my vote.\n\nI (like many feeling the Bern) am a life long independent, from a democrat family.\n\nMy values are left, but I'm that demographic of millenials with left leaning values that older democrats are always trying to shame into voting.\n\nAnd now that Bernie has managed to get a strategy to get those reluctant voters out, what are establishment democrats saying.\n\n""Remember, you owe Hillary your vote, since we know Bernie must lose""\n\nIf you are a democrat and you are angry that there are new voices, new voters, and that the independent voices have managed their own voting strategy which is something the DNC could NEVER match (or even figure out for that matter) \n\nThen perhaps it IS time to leave politics and turn them over to the motivated activists who are making this an actual bona fide political revolution.\n\n""Owie, revolution is hard."" \n\nUm yeah... generally speaking when we pull the country back from the brink of hell it aint easy.",train,2016-01-22 19:53:06.562506+00,6,,32516,approved,0,0,...,,,,,,,,0,6,True
1,240753,"I'm disturbed the the phrase ""...Bernie supporters and the threat they pose to the Democratic unity bridge....."" I understand that being overworked and over saturated with the politics involved in a revolution can take its toll, but I think that blaming it on Bernie supporters (as if they are any different than Hillary supporters) is unfair. It smacks of favoritism, which pretty much shoots down any ""unity"" argument.\n\nPolitics is no longer about unwavering loyalty to The Party, and absolute, unquestioning support of The One favorite within that party. It's an ""all bets are off"" revolt against the trap that is a two party system. It can be quite an adjustment to accept that no one is required to support a candidate just because of party affiliation. \n\nYes, it's hard to deal with. No, it's not supposed to be ""civil."" It's revolution. Revolutions are always disruptive and painful. At least we aren't facing down tanks in the street.",train,2016-01-23 18:36:47.971429+00,6,,32516,approved,0,0,...,,,,,,,,0,6,True
2,240754,"Exactly. We're breaking out of the comfy little box we've been stuffed into and it's got establishment Dems a little nervous. There are always those folks who want to maintain ""peace"" by submitting to the status quo, and then there are those revolutionaries who sometimes aren't as civilized and polished as the establishment want them to be.\n\nRevolution isn't supposed to be a cake walk. It's supposed to be uncomfortable. Progress is always unsettling. That doesn't make it a bad thing, except to those who have gotten too comfortable with their place in the status quo.",train,2016-01-23 18:45:42.652636+00,6,240643.0,32516,approved,0,0,...,,,,,,,,0,4,True
3,240798,Another 80 million for a corporation with billions in revenue. Money can buy lawmakers so easily. It's why the local papers criticized the lies to sell the CRC freeway then endorsed Metro President Hughes who fast-tracked it. You can thank democrats who control Salem Oregon and the corrupt union leaderships who endorse them. The only glimmer of hope is unions with member controlled endorsements will always endorse Sanders over Clinton. Until we get a revolution we have corporate control of our gov budgets http://electjoerowe.com/stimulus,train,2016-01-24 02:19:37.089278+00,6,,33302,approved,0,0,...,,,,,,,,0,4,True
4,241254,"um... .it's called a revolution for a reason... sometimes people get hurt during a revolution, if you can't accept that then you definitely shouldn't be in politics because you're bound to get some angry people on one side or another of what you believe, it looks to me you definitely will do better in another line of work. For the record though, the Bernie movement is more than just voting for the next president, it's sending a message to the corporate establishment that they no longer own the democratic party, that they can pack their bags and leave. That 15% of democrats will under no circumstance vote for Hillary- -I've signed the pact with 50,000+ others, and many who haven't signed still feel and align with us. You can vote for Bernie, or you'll get Trump.",train,2016-01-31 09:28:14.390097+00,6,,32516,approved,0,0,...,,,,,,,,0,4,True


In [3]:
# Extract features: local analysis


# RANDOM SAMPLE
sample = data.sample(n=35, random_state=SEED)
sample_texts = sample[TEXT_COL].to_list()

pos_tag_dict = {'N': ['NN', 'NNP'], 'V': ['VB', 'VBN'], 'A':['JJ']}

def get_features(text, pos_tag):
  " Return list of tokens with specific POS (pos_tag) from a text"
  tokenized_text = nltk.sent_tokenize(text)
  kwd_list = [kwd for kwd, pos 
              in nltk.pos_tag(nltk.word_tokenize(str(tokenized_text))) 
              if pos==pos_tag]
  return kwd_list


# ... text corpus
features_dict = {'text':[], 'N':[], 'V':[], 'A':[]}
for text in sample_texts:
  # for a text
  features_dict['text'].append(text)
  for feature_type, pos_tags in pos_tag_dict.items():
    # get list of kwd of each type (N, V, A)
    kwd_l = []
    for pos_tag in pos_tags:
      kwd_l.append(get_features(text, pos_tag))

    # add it to the dict
    features_dict[feature_type].append(kwd_l)


features = pd.DataFrame.from_dict(features_dict)
# features.sample(5)
features

Unnamed: 0,text,N,V,A
0,"The big part of what's missing with respect to the debates about gun control in the US is the part that relates to responsibilities. Nobody talks about that part, and the NRA seems to think it has some unalienable right to avoid anything to do with civic responsibilities and/or controls. \n\nInstead, people (and the courts) get caught up in arcane arguments about the wording of the language in the US Constitution, which is over 200 years old. That point is too prescriptive to be relevant in this day and age -- the authors didn't anticipate that. They didn't anticipate automatic weaponry and all the technological developments. They were thinking about their own revolution.\n\nSimilarly, we could be doing the same now for generations down the line in other ways.","[[[, part, respect, gun, control, part, part, right, anything, wording, language, point, day, age, weaponry, revolution, line, ]], [US, NRA, US, Constitution]]","[[think, avoid, do, be, anticipate, anticipate, be], [caught]]","[[big, unalienable, civic, arcane, old, prescriptive, relevant, automatic, technological, own, same, other]]"
1,"Look elsewhere for historical perspective. Eire completely omits the Batista coup, dictatorship and carnage, perhaps executed 20,000 countrymen that was bound to precipitate a revolution from someone capable and motivated, in this case , Fidel. I also heard Eire this moning in an interview on NPR. A Batista facist may be way striog but Eire is one with a exclusionary telling. Read and hear him out and judge for yourselves. Carlos Eite, no.","[[perspective, coup, dictatorship, carnage, revolution, someone, case, moning, interview, facist, way, telling, judge, ]], [Batista, Fidel, Eire, NPR, A, Batista, Eire, Eite]]","[[precipitate, be, hear], [bound]]","[[[, historical, capable, striog, exclusionary]]"
2,Canada is leading the AR revolution from the front. www.wrnch.com Our MTL based company already has full body AR running on the iPhone.,"[[[, revolution, front, company, body, iPhone, ]], [AR, AR]]","[[], [based]]",[[full]]
3,"Let them dive off the left wing deep end, the Marxist and their globalist games aren't anything you can rationalize or change. \nSimply put it's going to be a messy fight to the end. The Trump victory was a huge slap in the face to the alt left. One can only pray France, Germany and Italy join the revolution.","[[left, end, globalist, anything, fight, end, victory, slap, face, alt, revolution, ]], [Marxist, Simply, Trump, France, Germany, Italy]]","[[Let, dive, rationalize, change, be, pray], []]","[[deep, messy, huge]]"
4,"Do you honestly believe there is a difference at this point? I think I know where you are going with this. Increasing privatization is a large part of what has gotten us into this mess (& ""Libertarian Think Tank"" is an oxymoron - sorry, this is pretty easy to support). \n\nAll corporations care about is money & profit, which ends up going to an ever smaller percentage of people because of greed [full stop]. Perhaps you like working 60-80 hour weeks more productively for less pay & fewer benefits, but I sure don't, and this doesn't serve this country or business well in the long run (a view history supports -- check out what FDR accomplished and how he did it someday). This faddish selfish objectivism philosophy will ultimately result in anarchy, chaos, & revolution if we aren't careful.\n\nTo quote Lemony Snicket ""Historically, a story about people inside impressive buildings ignoring or even taunting people standing outside shouting at them turns out to be a story with an unhappy ending.""","[[[, difference, point, privatization, part, mess, sorry, money, profit, percentage, greed, stop, ], hour, pay, country, business, run, view, history, someday, objectivism, philosophy, anarchy, chaos, revolution, story, story, ]], [Think, Tank, [, FDR, Lemony, Snicket, Historically]]","[[support, serve, check, result, be], [gotten]]","[[large, Libertarian, oxymoron, pretty, easy, full, 60-80, long, faddish, selfish, careful, impressive, outside, unhappy]]"
5,"Because the basis of PC is education of the recalcitrant masses to purify society of any vestiges of gender, racial, religious, class bias, then as in the Chinese Cultural Revolution has graphically illustrated, we better be on our toes.\nCriticism and skepticism, especially of collective societal thinking, is necessary for vibrant, diverse, intellectual discourse in the pursuit of a better inclusive society. \nIn my humble opinion, PC is a movement that cultivates people to think about their assumptions and beliefs, encourages them not to cuddle together in the comfort of a obsolete past, which is good, but PC also skirts on the absolutism of some religions creating sheep instead of thinkers.\nStay on your toes.","[[[, basis, PC, education, recalcitrant, society, gender, class, bias, 'Criticism, skepticism, thinking, vibrant, diverse, discourse, pursuit, society, opinion, PC, movement, comfort, past, absolutism, ]], [Cultural, Revolution, PC]]","[[purify, be, think, cuddle, 'Stay], [illustrated]]","[[racial, religious, Chinese, collective, societal, necessary, intellectual, inclusive, humble, obsolete, good, sheep]]"
6,"From ""Nature's God: The Heretical Origins of the American Republic Paperback, by Matthew Stewart – July 6, 2015\n\n""It was Jefferson, too, who invoked “Nature’s God” in the Declaration of Independence’s first sentence. But this was not “the fictitious, meddling deity of the religious imagination but . . . nature itself or the universe comprehended as a whole. It is a way of talking about God long after God is dead.” This is Nature as God, the “presiding deity of the American Revolution."" \n\nAnd: ""When he stepped down from his position as commander of the Continental Army in 1783, Washington made a point of reminding his countrymen: “The foundation of our empire was not laid in the gloomy age of Ignorance and Superstition.” \nhttps://www.bostonglobe.com/arts/books/2014/07/19/review-nature-god-the-heretical-origins-american-republic-matthew-stewart/qjGOjlN2aS9haqXldb5joN/story.html\n\nhttp://www.ideasroadshow.com/issues/matthew-stewart-2014-06-13","[[[, sentence, deity, imagination, nature, universe, whole, way, ”, “, deity, position, commander, point, foundation, empire, age, \nhttps, ]], [Nature\, God, Heretical, Origins, American, Republic, Paperback, Matthew, Stewart, –, July, Jefferson, Nature, ’, God, ”, Declaration, Independence, ’, God, God, Nature, God, Revolution, Continental, Army, Washington, Ignorance, Superstition., ”]]","[[“], [laid]]","[[“, first, fictitious, religious, dead., American, gloomy, //www.bostonglobe.com/arts/books/2014/07/19/review-nature-god-the-heretical-origins-american-republic-matthew-stewart/qjGOjlN2aS9haqXldb5joN/story.html\n\nhttp, //www.ideasroadshow.com/issues/matthew-stewart-2014-06-13]]"
7,"""We need a revolution in education that indigenizes the system""\n\ncould not agree LESS. This would keep young native people at a disadvantage","[[[, revolution, education, system, disadvantage, ]], [LESS]]","[[agree, keep], []]","[[young, native]]"
8,"That's fine GC, for those of us who have the luxury to wait for democratically elected socialism (never, so far in Canadian history.) What Cuba got was revolution with all the problems that follow, particularly because of the nature of revolutionaries who gain power. But Cubans were in no position to wait and would have been fools to work patiently given their political system. They couldn't and still can afford the luxury of our democractic sensitivities.","[[[, luxury, socialism, history, revolution, nature, power, position, wait, system, luxury, ]], [GC, Cuba]]","[[wait, have, work, afford], [elected, been, given]]","[[fine, Canadian, political, democractic]]"
9,"The Dems would like to co-opt the Sanders people, but they are mostly sticking to Sanders' revolution movement and/or going Green. The organizing is being done organically online. But there are many spokes in the wheel of change and The more people pushing for progressive change the better at this point.","[[[, revolution, movement, and/or, wheel, change, change, point, ]], [Dems, Sanders, Green]]","[[like, co-opt], [done]]","[[online, many, progressive]]"


In [4]:
# ... temporal analysis? Compare smt with historical panflets.

import dateutil.parser
from datetime import datetime

def detect_datetime(string):
    d = dateutil.parser.parse(string)
    return d

# ... transform in df
# data['created_date'] = data['created_date'].apply(lambda date: detect_datetime(date))

ex_date =data.loc[0, 'created_date']

d = dateutil.parser.parse(ex_date)
print(type(d))
print(d)

<class 'datetime.datetime'>
2016-01-22 19:53:06.562506+00:00


In [5]:
import nltk
from nltk.collocations import *

# Extract features: global analysis
ex_input_text = features.loc[1,'text']
print(ex_input_text)

def get_concept_collocations(input_text, concept):
    # Text collocations with word Revolution
    tokens = nltk.wordpunct_tokenize(input_text.lower())
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    ## Bigrams and Trigrams
    finder2 = BigramCollocationFinder.from_words(tokens)
    finder3 = TrigramCollocationFinder.from_words(tokens)
    def _return_ngrams(ngram_measures, finder, concept):
        # Ngrams with 'Revolution' as a member
        finder.apply_ngram_filter(lambda *w: concept not in w)
        # return n-grams with  PMI
        return finder.score_ngrams(ngram_measures.likelihood_ratio)
    
    return _return_ngrams(bigram_measures, finder2, concept), _return_ngrams(trigram_measures, finder3, concept)

ex_bigrams, ex_trigrams = get_concept_collocations(ex_input_text, concept)
print(f'{ex_bigrams}\n{ex_trigrams}')


Look elsewhere for historical perspective. Eire completely omits the Batista coup, dictatorship and carnage, perhaps executed 20,000 countrymen that was bound to precipitate a revolution from someone capable and motivated, in this case , Fidel. I also heard Eire this moning in an interview on NPR.   A Batista facist may be way striog but Eire is one with a exclusionary telling. Read and hear him out and judge for yourselves.  Carlos Eite, no.
[(('revolution', 'from'), 10.873491397677103), (('a', 'revolution'), 7.054406387908225)]
[(('revolution', 'from', 'someone'), 21.746982795354203), (('a', 'revolution', 'from'), 17.92789778558536), (('precipitate', 'a', 'revolution'), 17.92789778558536)]


In [15]:
# Collocations by time frames: 3 time frames
id_list = data.id.to_list()
n = len(id_list)
print(f'Total of {n} texts')

# 2016
data_1 = data.loc[data.created_date<'2017-01-01 00:00:00.000000+00',]
# 2017
data_2 = data.loc[(data.created_date>'2017-01-01 00:00:00.000000+00')&(data.created_date<'2018-01-01 00:00:00.000000+00'),]

corpus_1_list = data_1.loc[:,'comment_text'].to_list()
corpus_2_list = data_2.loc[:,'comment_text'].to_list()


Total of 3047 texts


In [18]:
# First time frame
print(f'2016 Data: {len(corpus_1_list)}')
print(data_1.loc[0,'created_date'])
print(data_1.loc[len(corpus_1_list)-1,'created_date'])

corpus_1 = ' '.join(corpus_1_list)


bigrams_1, trigrams_1 = get_concept_collocations(corpus_1, concept)

topn=40
print(f'Bigrams:\n{[x[0] for x in bigrams_1[:topn]]}')
print(f'Trigrams:\n{[x[0] for x in trigrams_1[:topn]]}')

2016 Data: 707
2016-01-22 19:53:06.562506+00
2016-12-31 18:31:52.201535+00
Bigrams:
[('industrial', 'revolution'), ('american', 'revolution'), ('revolution', '.'), ('a', 'revolution'), ('french', 'revolution'), ('cultural', 'revolution'), ('sexual', 'revolution'), ('political', 'revolution'), ('the', 'revolution'), ('revolution', 'against'), ('revolution', ','), ('revolution', 'the'), ('communist', 'revolution'), ('cuban', 'revolution'), ('revolution', '"'), ('counter', 'revolution'), ('iranian', 'revolution'), ('.', 'revolution'), ('peaceful', 'revolution'), ('armed', 'revolution'), ('russian', 'revolution'), ('revolution', 'in'), ('revolution', 'was'), ('violent', 'revolution'), ('bolshevik', 'revolution'), ('socialist', 'revolution'), (',', 'revolution'), ('agricultural', 'revolution'), ('digital', 'revolution'), ('quiet', 'revolution'), ('revolution', 'will'), ('to', 'revolution'), ('color', 'revolution'), ('and', 'revolution'), ('revolution', 'when'), ('revolution', '!'), ('bloody

In [20]:
# Second time frame
print(f'2017 Data: {len(corpus_2_list)}')
print(data_2.iloc[0,3])
print(data_2.iloc[-1,3])


corpus_2 = ' '.join(corpus_2_list)


bigrams_2, trigrams_2 = get_concept_collocations(corpus_2, concept)

print(f'Bigrams:\n{[x[0] for x in bigrams_2[:topn]]}')
print(f'Trigrams:\n{[x[0] for x in trigrams_2[:topn]]}')

2017 Data: 2340
2017-01-01 11:02:35.521137+00
2017-11-11 00:30:15.010936+00
Bigrams:
[('a', 'revolution'), ('industrial', 'revolution'), ('revolution', '.'), ('american', 'revolution'), ('french', 'revolution'), ('cultural', 'revolution'), ('quiet', 'revolution'), ('sexual', 'revolution'), ('russian', 'revolution'), ('revolution', 'the'), ('revolution', ','), ('revolution', 'against'), ('violent', 'revolution'), ('bolshevik', 'revolution'), (',', 'revolution'), ('revolution', 'in'), ('.', 'revolution'), ('revolution', 'halfway'), ('revolution', 'was'), ('glorious', 'revolution'), ('green', 'revolution'), ('the', 'revolution'), ('to', 'revolution'), ('revolution', '?'), ('democratic', 'revolution'), ('sense', 'revolution'), ('counter', 'revolution'), ('peaceful', 'revolution'), ('revolution', 'and'), ('technological', 'revolution'), ('revolution', '”.'), ('revolution', 'a'), ('iranian', 'revolution'), ('revolution', 'is'), ('revolution', 'inevitable'), ('revolution', '"'), ('political',