In [1]:
import pickle
import pandas as pd
import os
import numpy as np
import json 

# make dataframe of articles scraped on colab using twitter outlinks

In [2]:
# open the dic made in colab. The outlink from each tweet in the tweet_df is the key
# the headline, section, article text and expanded url are the values
# we only use outlinks for tweets where the tweet is differnt from the article headline, which we 
# found out be merging tweet_df and articles_df. Would be a waste of time to scrape those outlinks as 
# we already linked the tweet to article
with open('../data/import_data/cnn_dic', 'rb') as f:
    dic1=json.load(f)

In [3]:
outlink_art_df=pd.DataFrame.from_dict(dic1, orient = 'index', columns=['expURL', 'category', 'headline', 'date', 'artText'])

In [4]:
outlink_art_df.reset_index(inplace=True)

In [5]:
outlink_art_df.rename(columns={'index':'shortURL', 'text':'artText'}, inplace=True)

# link tweets df to the df above (one article maybe linked to by more than one tweet)

In [6]:
cnn_tweets = pd.read_csv('../data/cnn_tweets_df.csv', converters={"Outlinks": lambda x: x.strip("[]").replace("'","").split(", ")}, low_memory=False)

In [7]:
cnn_tweets=cnn_tweets.explode('Outlinks')

In [8]:
cnn_tweets.rename(columns={'Text': 'tweetText', 'headline':'tweetText','Outlinks': 'shortURL', 'Replies':'replies', 'Retweets': 'retweets', 'Likes':'likes'}, inplace=True)

In [9]:
tweet_sc_art_join = outlink_art_df.merge(cnn_tweets, on='shortURL', how='outer')

In [10]:
tweet_sc_art_join['Datetime'] = tweet_sc_art_join['Datetime'].apply(lambda x: x[:10])

In [11]:
tweet_sc_art_join.drop('date', axis=1, inplace=True)

In [12]:
tweet_sc_art_join.rename(columns={'Datetime': 'date'}, inplace=True)

In [13]:
tweet_sc_art_join.drop_duplicates(inplace=True)

In [14]:
tweet_sc_art_join.reset_index(drop=True, inplace=True)

# make another df linking the scraped articles and tweet where summary is same as headline

In [15]:
with open('../data/cnn_articles_df', 'rb') as f:
    cnn_scrape_articles=pickle.load(f)

In [16]:
#need to clean tweets to get rid of hyperlink
cnn_tweets.Datetime=cnn_tweets.Datetime.apply(lambda x: x[:10])
cnn_tweets.rename(columns={'Datetime':'date'}, inplace=True)

In [17]:
def text_cleaner(x):
    if x!='blank':
        x = x.partition('http')
        x=x[0].strip()
    return x

In [18]:
cnn_tweets['tweetText']=cnn_tweets['tweetText'].apply(text_cleaner)

In [19]:
cnn_tw_art = cnn_scrape_articles.merge(cnn_tweets, left_on='headline', right_on='tweetText')

In [20]:
cnn_tw_art.rename(columns={'URL':'expURL', 'text':'artText', 'date_x':'date', 'Text':'tweetText', 'Replies':'replies', 'Retweets':'retweets', 'Likes':'likes'}, inplace=True)

In [21]:
cnn_tw_art.drop('date_y', axis=1, inplace=True)

In [22]:
full_join = pd.concat([tweet_sc_art_join, cnn_tw_art])

In [23]:
full_join.reset_index(drop=True, inplace=True)

In [29]:
#full join is essentially reconstructing the original tweets dataframe, adding the articles. 
# we did this by linking rows where tweet was same as headline (cnn_tw_art) and by retrieving 
# articles where they weren't by opening the hyperlink in the tweet and scraping article (tweet_sc_art_join). 
full_join

Unnamed: 0,shortURL,expURL,category,headline,artText,date,tweetText,replies,retweets,likes
0,https://cnn.it/2BVsec3,https://www.cnn.com/2020/07/02/europe/russia-v...,europe,Vladimir Putin could rule Russia until 2036. H...,(CNN)Russian President Vladimir Putin is alre...,2020-07-02,Russian President Vladimir Putin is already on...,375,239,744
1,https://cnn.it/2DTz9Cb,https://www.facebook.com/AndersonCooperFullCir...,blank,Anderson Cooper Full Circle - Meet the top 10 ...,,2018-11-22,Happy Thanksgiving! Anderson introduces this y...,43,38,162
2,https://cnn.it/2LWecu0,https://www.cnn.com/2018/08/03/us/detroit-offi...,us,Detroit officer suspended after video shows hi...,(CNN)A Detroit police officer has been suspen...,2018-08-03,A Detroit police officer has been suspended af...,63,125,188
3,https://cnn.it/3lvOQTu,https://www.cnn.com/2021/09/18/us/coronavirus-...,us,Lions and tigers at DC's National Zoo test pre...,(CNN)Animal keepers at the Smithsonian's Nati...,2021-09-18,Animal keepers at Smithsonian's National Zoo i...,151,160,558
4,http://cnn.it/2bMrIxL,https://money.cnn.com/2016/08/29/technology/ap...,technology,Apple's next iPhone will likely be unveiled Se...,,2016-08-29,Apple is expected to announce the latest iPhon...,36,423,459
...,...,...,...,...,...,...,...,...,...,...
316213,https://cnn.it/3aFR2Ro,https://www.cnn.com/2020/03/09/health/cpr-livi...,health,"CPR has improved, here's what to do (and sing)...",(CNN)It only takes two hands to save a life. ...,2020-03-09T08:34:14Z,"CPR has improved, here's what to do (and sing)...",18,193,484
316214,http://cnn.it/2hA9uQ0,http://www.cnn.com/2016/12/23/politics/us-isra...,politics,What the UN vote says about how Trump will tre...,Washington (CNN)It's Donald Trump and Benjamin...,2016-12-24T00:15:56Z,What the UN vote says about how Trump will tre...,117,102,170
316215,http://cnn.it/2q6NQtk,http://www.cnn.com/2017/05/11/us/new-orleans-c...,us,New Orleans begins to remove second Confederat...,"(CNN)As police stood between opposing crowds,...",2017-05-11T08:58:33Z,New Orleans begins to remove second Confederat...,95,116,433
316216,http://cnn.it/2q5Vm7N,http://www.cnn.com/2017/05/11/us/new-orleans-c...,us,New Orleans begins to remove second Confederat...,"(CNN)As police stood between opposing crowds,...",2017-05-11T08:58:33Z,New Orleans begins to remove second Confederat...,81,102,381


# Make df of all articles, joining original scrape with twitter links

In [24]:
# we have the original scraped article df, but we may have found more when scraping from the tweet hyperlinks
# need to concat the two dataframes and drop duplicates 
cnn_scrape_articles.drop_duplicates(subset='headline', inplace=True)

In [25]:
cnn_scrape_articles.reset_index(drop=True, inplace=True)

In [26]:
cnn_scrape_articles.rename(columns={'text':'artText', 'URL':'expURL'}, inplace=True)

In [27]:
article_analysis = pd.concat([cnn_scrape_articles, full_join])

In [28]:
article_analysis.drop(['replies', 'retweets', 'likes', 'tweetText', 'shortURL'], axis=1, inplace=True)

In [29]:
article_analysis.drop_duplicates(subset=['headline', 'artText'], inplace=True)

In [30]:
article_analysis.reset_index(drop=True, inplace=True)

In [32]:
article_analysis=article_analysis.dropna(subset=['artText'])

In [33]:
index_names = article_analysis[article_analysis['artText'] == 'blank' ].index
article_analysis.drop(index_names, inplace=True)

In [41]:
index_names = article_analysis[article_analysis['artText'] == '' ].index
article_analysis.drop(index_names, inplace=True)

In [None]:
#for i, r in article_analysis.iterrows():
 #   if not r['artText'] or r['artText'] =='blank':
  #      c+=1
   #     print(i)
    #    article_analysis.drop(i, axis=0, inplace=True)
        

3
12
18
35
36
58
59
76
89
91
95
98
104
109
119
122
133
134
138
148
152
154
204
210
221
261
262
264
265
296
298
324
337
382
389
392
410
427
476
489
498
506
515
521
548
554
573
579
580
582
606
625
628
654
709
743
806
812
826
842
843
858
865
873
881
896
935
951
953
955
961
964
966
1014
1047
1060
1062
1104
1108
1112
1125
1132
1138
1139
1166
1174
1199
1205
1216
1225
1232
1233
1236
1278
1286
1294
1297
1299
1311
1320
1324
1333
1338
1344
1347
1372
1378
1388
1402
1412
1418
1428
1435
1436
1442
1455
1475
1476
1490
1491
1496
1513
1518
1519
1532
1555
1597
1609
1614
1618
1621
1629
1645
1660
1664
1680
1685
1686
1707
1714
1716
1718
1723
1740
1742
1759
1783
1795
1798
1815
1816
1831
1832
1836
1840
1848
1862
1879
1884
1890
1939
1946
1980
1991
2000
2001
2009
2038
2041
2049
2051
2052
2056
2064
2072
2083
2089
2094
2131
2140
2141
2172
2180
2193
2213
2238
2239
2243
2255
2279
2280
2292
2294
2316
2324
2332
2333
2335
2338
2339
2343
2344
2355
2371
2377
2395
2399
2419
2421
2423
2424
2449
2450
2452
2470
2475
2496
2

In [43]:
article_analysis.reset_index(drop=True, inplace=True)

In [44]:
article_analysis.drop_duplicates(subset='headline', inplace=True)

In [45]:
article_analysis=article_analysis.dropna(subset=['artText'])

In [46]:
article_analysis.reset_index(drop=True, inplace=True)

In [47]:
with open('../data/cnn_articles_analysis', 'wb') as f:
    pickle.dump(article_analysis, f)

# Make df of all tweets, linked to the article

In [48]:
twitter_article_analysis=full_join.copy()

In [49]:
twitter_article_analysis.drop('shortURL', axis=1, inplace=True)

In [50]:
twitter_article_analysis.drop_duplicates(subset='tweetText', inplace=True)

In [51]:
twitter_article_analysis.reset_index(drop=True, inplace=True)

In [51]:
twitter_article_analysis

Unnamed: 0,expURL,category,headline,artText,date,tweetText,replies,retweets,likes
0,https://www.cnn.com/2020/07/02/europe/russia-v...,europe,Vladimir Putin could rule Russia until 2036. H...,(CNN)Russian President Vladimir Putin is alre...,2020-07-02,Russian President Vladimir Putin is already on...,375,239,744
1,https://www.facebook.com/AndersonCooperFullCir...,blank,Anderson Cooper Full Circle - Meet the top 10 ...,,2018-11-22,Happy Thanksgiving! Anderson introduces this y...,43,38,162
2,https://www.cnn.com/2018/08/03/us/detroit-offi...,us,Detroit officer suspended after video shows hi...,(CNN)A Detroit police officer has been suspen...,2018-08-03,A Detroit police officer has been suspended af...,63,125,188
3,https://www.cnn.com/2021/09/18/us/coronavirus-...,us,Lions and tigers at DC's National Zoo test pre...,(CNN)Animal keepers at the Smithsonian's Nati...,2021-09-18,Animal keepers at Smithsonian's National Zoo i...,151,160,558
4,https://money.cnn.com/2016/08/29/technology/ap...,technology,Apple's next iPhone will likely be unveiled Se...,,2016-08-29,Apple is expected to announce the latest iPhon...,36,423,459
...,...,...,...,...,...,...,...,...,...
306035,http://www.cnn.com/2017/05/11/health/female-ge...,health,The alarming rise of female genital mutilation...,(CNN)It's a brutal practice that's inflicted ...,2017-05-11T08:00:59Z,The alarming rise of female genital mutilation...,79,144,130
306036,https://www.cnn.com/2020/03/09/health/cpr-livi...,health,"CPR has improved, here's what to do (and sing)...",(CNN)It only takes two hands to save a life. ...,2020-03-09T08:34:14Z,"CPR has improved, here's what to do (and sing)...",18,193,484
306037,http://www.cnn.com/2016/12/23/politics/us-isra...,politics,What the UN vote says about how Trump will tre...,Washington (CNN)It's Donald Trump and Benjamin...,2016-12-24T00:15:56Z,What the UN vote says about how Trump will tre...,117,102,170
306038,http://www.cnn.com/2017/05/11/us/new-orleans-c...,us,New Orleans begins to remove second Confederat...,"(CNN)As police stood between opposing crowds,...",2017-05-11T08:58:33Z,New Orleans begins to remove second Confederat...,95,116,433


In [52]:
for tw in ['replies', 'retweets', 'likes']:
    twitter_article_analysis[tw]=twitter_article_analysis[tw].astype(int)

In [53]:
with open('../data/cnn_twitter_articles_analysis', 'wb') as f:
    pickle.dump(twitter_article_analysis, f)