In [1]:
import glob 
import json
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Mount Google Drive at /content/drive so the project files stored there
# (scraped tweets, the Stanford NER archive) are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Tweet Scraper

In [3]:
# !cd /content/drive/My\ Drive/Save\ the\ Children\
#   && cd TweetScraper/ && pip install --user -r requirements.txt && sh install.sh


In [4]:
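# Note: each `!` line runs in its own subshell, so the `cd` must be chained
# with `&&` for the crawl to run inside the TweetScraper directory.
# !cd /content/drive/My\ Drive/Save\ the\ Children/TweetScraper && scrapy crawl TweetScraper -a query="Afghanistan"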

### Upload data to Google Drive for easy access in Colab

In [5]:
# rename scraped tweet files as json so parsing is more clear later on
# !rename 's/$/.json/' /content/drive/My\ Drive/Save\ the\ Children/Data/tweet/*

### Construct DataFrame with scraped tweets

In [6]:
# Load every scraped tweet file and assemble one DataFrame row per file.
# Each file is parsed as a single JSON document; only these fields of its
# `raw_data` object are used: full_text, entities.hashtags[].text,
# retweet_count and geo.
#
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to
# make the row order -- and therefore the saved CSV -- reproducible across runs.
files = sorted(glob.glob('/content/drive/My Drive/Save the Children/Data/tweet/*'))
retweets, hashtags, geo, full_text = [], [], [], []

for filename in tqdm(files):
  # Decode explicitly as UTF-8 instead of relying on the platform default
  # encoding; the file is closed as soon as it has been parsed.
  with open(filename, encoding='utf-8') as f:
    raw_data = json.load(f)['raw_data']

  full_text.append(raw_data['full_text'])
  # A set drops duplicate hashtags within a single tweet.
  hashtags.append({item['text'] for item in raw_data['entities']['hashtags']})
  retweets.append(raw_data['retweet_count'])
  geo.append(raw_data['geo'])

# Build the frame in one step (same columns, same order as before) rather than
# growing an empty DataFrame column by column.
df = pd.DataFrame({
  'full_text': full_text,
  'hashtags': hashtags,
  'retweets': retweets,
  'geo': geo,
})

HBox(children=(FloatProgress(value=0.0, max=1094.0), HTML(value='')))




In [7]:
# Preview the assembled DataFrame: one row per tweet file with its text,
# hashtag set, retweet count and geo field.
df

Unnamed: 0,full_text,hashtags,retweets,geo
0,"Around the country, millions of children live...","{Afghanistan, EducationForAll}",4,
1,Sneak peek! The #2019CPMS Summary just arrived...,"{FutureLinksInCP, CPMS, 2019CPMS}",3,
2,@TsarKastik @RT_com @JewRussophile they are us...,{},0,
3,@FatihinFKM @TsarKastik @RT_com @JewRussophile...,{},0,
4,"@CPiE_Global Thanks for this initiative , it c...",{},0,
...,...,...,...,...
1089,"""What happened after the Soviets pulled out of...",{},103,
1090,Children in #Afghanistan have access to #human...,"{humanitarian, Afghanistan, HumDevNexus, Leave...",3,
1091,"Articles and more.... Storie, racconti, recens...","{thecircuscomestothevillage, Afghanistan, flut...",0,
1092,"”People are suffering, people are dying, entir...",{},61087,


## Setup NER tagger to extract important entities from tweets

In [8]:
# Extract stanford-ner data
!unzip /content/drive/My\ Drive/Save\ the\ Children/stanford-ner-4.0.0.zip

Archive:  /content/drive/My Drive/Save the Children/stanford-ner-4.0.0.zip
   creating: stanford-ner-4.0.0/
  inflating: stanford-ner-4.0.0/ner-gui.sh  
  inflating: stanford-ner-4.0.0/build.xml  
  inflating: stanford-ner-4.0.0/stanford-ner.jar  
  inflating: stanford-ner-4.0.0/sample-conll-file.txt  
  inflating: stanford-ner-4.0.0/README.txt  
  inflating: stanford-ner-4.0.0/NERDemo.java  
  inflating: stanford-ner-4.0.0/sample.ner.txt  
  inflating: stanford-ner-4.0.0/ner.sh  
  inflating: stanford-ner-4.0.0/LICENSE.txt  
   creating: stanford-ner-4.0.0/lib/
  inflating: stanford-ner-4.0.0/lib/joda-time.jar  
  inflating: stanford-ner-4.0.0/lib/jollyday-0.4.9.jar  
  inflating: stanford-ner-4.0.0/lib/stanford-ner-resources.jar  
  inflating: stanford-ner-4.0.0/stanford-ner-4.0.0-sources.jar  
  inflating: stanford-ner-4.0.0/sample.txt  
  inflating: stanford-ner-4.0.0/ner-gui.command  
  inflating: stanford-ner-4.0.0/ner.bat  
  inflating: stanford-ner-4.0.0/ner-gui.bat  
  inflati

In [9]:
import nltk
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer data used by NLTK's tokenizers.
nltk.download('punkt')

# The Stanford NER distribution is available from:
# https://nlp.stanford.edu/software/CRF-NER.shtml#Download
#
# Note: both paths below are relative to the directory the archive was
# extracted into -- adjust them if the tagger lives somewhere else.
st = StanfordNERTagger(
  'stanford-ner-4.0.0/classifiers/english.all.3class.distsim.crf.ser.gz',
  'stanford-ner-4.0.0/stanford-ner.jar',
  encoding='utf-8',
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [10]:
# Run named-entity recognition over every tweet and keep only the entity tokens.
#
# Step 1: tokenize all tweets up front (pure Python; tqdm tracks this pass).
tokenized_texts = [word_tokenize(text) for text in tqdm(df['full_text'])]

# Step 2: tag every tweet in ONE call. `st.tag` starts a fresh Java process on
# each invocation, so calling it per tweet pays the JVM start-up and model-load
# cost once per row; `tag_sents` sends the whole batch through a single process
# and returns one list of (token, tag) pairs per input tweet, in input order.
# NOTE(review): NLTK re-splits the batched output by each tweet's token count,
# so this assumes the Java side preserves token boundaries -- spot-check a few rows.
# The guard skips launching Java at all when there is nothing to tag.
classified_texts = st.tag_sents(tokenized_texts) if tokenized_texts else []

# Step 3: drop tokens tagged 'O', keeping only tokens labelled as part of an entity.
tagged_items = [
  [item for item in classified_text if item[1] != 'O']
  for classified_text in classified_texts
]

df['tags'] = tagged_items

HBox(children=(FloatProgress(value=0.0, max=1094.0), HTML(value='')))




In [11]:
# Persist the tagged tweets to Drive as CSV.
# The default index is written as the first (unnamed) column, and the
# `hashtags` (sets) and `tags` (lists of tuples) cells are stored as their
# string representations, so they need to be parsed again when reloading.
df.to_csv('/content/drive/My Drive/Save the Children/tweet_data.csv')