In [2]:
import pandas as pd
import tweepy
from pprint import pprint

# Index

* [Part 1](#Part-1)
    * [Using (old) tweet extraction way for EDA](#Using-(old)-tweet-extraction-way-for-EDA)
    * [Using Cursor](#Using-Cursor)
* [Part 2](#Part-2)

**Part 1:** You have to write a python script which can fetch all the tweets(as many as allowed by Twitter
API) done by `@midasIIITD` twitter handle and dump the responses into JSONlines file.  

**Part 2:** The other part of your script should be able to parse these JSONline files to display the
following for every tweet in a tabular format.
* The text of the tweet.  
* Date and time of the tweet.  
* The number of favorites/likes.  
* The number of retweets.  
* Number of Images present in Tweet. If no image returns None.

# Authorization

In [4]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Part 1

## Using (old) tweet extraction way for EDA

Using the old way for tweet extraction of data exploration

In [7]:
# max count of tweets: 200
tweets = api.user_timeline(screen_name='midasIIITD')

In [37]:
len(tweets)

20

In [45]:
pprint(tweets[0])

Status(_api=<tweepy.api.API object at 0x7f7607385c18>, _json={'created_at': 'Sun Apr 07 06:55:19 +0000 2019', 'id': 1114783695129534464, 'id_str': '1114783695129534464', 'text': 'Other queries: "none of the Tweeter Apis give the correct count of favorites tested for most of them, all give the… https://t.co/2jnCTMMqV8', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/2jnCTMMqV8', 'expanded_url': 'https://twitter.com/i/web/status/1114783695129534464', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [116, 139]}]}, 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1021355762575073281, 'id_str': '1021355762575073281', 'name': 'MIDAS IIITD', 'screen_name': 'midasIIITD', 'location': 'New Delhi, India', 'description'

## [Tweet objects](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object)

### 1. text of tweet

In [16]:
pprint(tweets[0].text)

('Other queries: "none of the Tweeter Apis give the correct count of favorites '
 'tested for most of them, all give the… https://t.co/2jnCTMMqV8')


### 2. data and time of tweet (UTC time)

In [44]:
str(tweets[0].created_at)

'2019-04-07 06:55:19'

### 3. The number of favorites/likes.

In [47]:
tweets[0].favorite_count

0

### 4. The number of retweets

In [48]:
tweets[0].retweet_count

1

### 5. Number of [Images](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object) present in Tweet. If no image returns None

In [51]:
# media appears only if available
tweets[0].entities

{'hashtags': [],
 'symbols': [],
 'user_mentions': [],
 'urls': [{'url': 'https://t.co/2jnCTMMqV8',
   'expanded_url': 'https://twitter.com/i/web/status/1114783695129534464',
   'display_url': 'twitter.com/i/web/status/1…',
   'indices': [116, 139]}]}

In [55]:
image_count = 0
try: 
    if tweets[0].entities['media'][0]['type'] == 'photo':
        image_count += 1
except:
    image_count = None
pprint(image_count)

None


## Using [`Cursor`](http://docs.tweepy.org/en/3.7.0/cursor_tutorial.html)

Using `Cursor` object instead of the old way for ease.

***Note:*** *Using `tweet_mode=’extended’` to Cursor object call to include tweets > 140 characters. Replace `.text` attribute with `.full_text`.*

In [27]:
# create a temporary empty list
temp = []

In [29]:
# use Cursor object to get tweets and append it to the list created above
for status in tweepy.Cursor(api.user_timeline, screen_name='@midasIIITD', tweet_mode="extended").items():
    temp.append(status)

In [30]:
# no of tweets
len(temp)

333

In [59]:
# data of first tweet
temp[0]._json

{'created_at': 'Sun Apr 07 06:55:19 +0000 2019',
 'id': 1114783695129534464,
 'id_str': '1114783695129534464',
 'full_text': 'Other queries: "none of the Tweeter Apis give the correct count of favorites tested for most of them, all give the wrong count. same is true for retweet. this mostly happens if the no. of likes, retweet is very large. So, what shld be done?"\nAns: Just use the count given by API.',
 'truncated': False,
 'display_text_range': [0, 279],
 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []},
 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 1021355762575073281,
  'id_str': '1021355762575073281',
  'name': 'MIDAS IIITD',
  'screen_name': 'midasIIITD',
  'location': 'New Delhi, India',
  'description': 'MIDAS is a group of researchers

In [61]:
# using `full_text` attribute rather than `text
temp[0]._json['full_text']

'Other queries: "none of the Tweeter Apis give the correct count of favorites tested for most of them, all give the wrong count. same is true for retweet. this mostly happens if the no. of likes, retweet is very large. So, what shld be done?"\nAns: Just use the count given by API.'

### Save into JSON file

In [63]:
with open('tweet.json', 'w', encoding='utf8') as file:
    for i in temp:
        json.dump(i._json, file, sort_keys=True, indent=4)

### Save data into `jsonlines` file

In [64]:
# install jsonlines library for ease
!pip install jsonlines

Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0


In [69]:
import jsonlines
with jsonlines.open('tweets.jsonl', mode='w', sort_keys=True) as writer:
    for i in temp:
        writer.write(i._json)

# Part 2

Parse `jsonlines` file to get the required data

In [75]:
# read the jsonlines file and get data into a python list
total_tweets = []
with jsonlines.open('tweets.jsonl') as reader:
    for obj in reader:
        total_tweets.append(obj)

In [76]:
# compare data of first tweet to confirm everything working as intended
total_tweets[0]

{'contributors': None,
 'coordinates': None,
 'created_at': 'Sun Apr 07 06:55:19 +0000 2019',
 'display_text_range': [0, 279],
 'entities': {'hashtags': [], 'symbols': [], 'urls': [], 'user_mentions': []},
 'favorite_count': 2,
 'favorited': False,
 'full_text': 'Other queries: "none of the Tweeter Apis give the correct count of favorites tested for most of them, all give the wrong count. same is true for retweet. this mostly happens if the no. of likes, retweet is very large. So, what shld be done?"\nAns: Just use the count given by API.',
 'geo': None,
 'id': 1114783695129534464,
 'id_str': '1114783695129534464',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'place': None,
 'retweet_count': 1,
 'retweeted': False,
 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 'truncated': False,
 'user': 

In [95]:
# create an empty dictionary of lists, which will hold tweets' information sequentially
tweets_info = {'tweettext': [],
               'datetime': [],
               'favorite_count': [],
               'retweet_count': [],
               'image_count': []
              }

In [96]:
# iterate through tweets and append the required tweet info into the lists of above created dictionary
for tweet in total_tweets:
    tweets_info['tweettext'].append(tweet['full_text'])
    tweets_info['datetime'].append(str(tweet['created_at']))
    tweets_info['favorite_count'].append(tweet['favorite_count'])
    tweets_info['retweet_count'].append(tweet['retweet_count'])
    image_count = 0
    try: 
        if tweet['entities']['media'][0]['type'] == 'photo':
            image_count += 1
    except:
        image_count = 0
    tweets_info['image_count'].append(image_count)

In [97]:
# convert dictionary into pandas dataframe
pd.DataFrame(tweets_info)

Unnamed: 0,tweettext,datetime,favorite_count,retweet_count,image_count
0,"Other queries: ""none of the Tweeter Apis give ...",Sun Apr 07 06:55:19 +0000 2019,2,1,0
1,"Other queries: ""do we have to make two differe...",Sun Apr 07 06:53:38 +0000 2019,2,0,0
2,"Other queries: ""If using Twitter api, it does ...",Sun Apr 07 05:32:27 +0000 2019,4,1,0
3,Response to some queries asked by students on ...,Sun Apr 07 05:29:40 +0000 2019,6,1,0
4,RT @kdnuggets: Top 8 #Free Must-Read #Books on...,Sat Apr 06 17:11:29 +0000 2019,0,2,0
5,@nupur_baghel @PennDATS Congratulation @nupur_...,Sat Apr 06 16:43:27 +0000 2019,14,3,1
6,We have emailed the task details to all candid...,Fri Apr 05 16:08:37 +0000 2019,10,1,0
7,RT @rfpvjr: Our NAACL paper on polarization in...,Fri Apr 05 04:05:11 +0000 2019,0,16,0
8,RT @kdnuggets: Effective Transfer Learning For...,Fri Apr 05 04:04:43 +0000 2019,0,10,1
9,RT @stanfordnlp: What’s new in @Stanford CS224...,Wed Apr 03 18:31:53 +0000 2019,0,58,0
