In [37]:
import json
!pip install -q -U wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict

In [38]:
# Count, case-insensitively, how often each hashtag occurs in the tweet dump.
# Every line of the .jl file is parsed as one JSON document whose tweets are
# read from doc['response']['data'].
hashtag_counter = defaultdict(int)  # unseen hashtags start at 0

# JSON text is UTF-8 by specification; stating it explicitly avoids relying on
# the platform's default encoding. The `with` block closes the file afterwards.
with open("Tweet-Data/digital_twin_tweets-2020.jl", encoding="utf-8") as tweet_file:
    for line in tweet_file:
        try:
            doc = json.loads(line)
            tweets = doc['response']['data']
        except (json.JSONDecodeError, KeyError, TypeError):
            continue  # best effort: skip malformed lines and pages without tweet data

        for tweet in tweets or []:
            # Handle each tweet on its own: a tweet without hashtags has no
            # 'hashtags' entry (or no 'entities' at all). Previously one such
            # tweet raised inside a page-wide try block and silently discarded
            # every remaining tweet of that page, under-counting hashtags.
            try:
                tags = [ht['tag'].lower() for ht in tweet['entities']['hashtags']]
            except (KeyError, TypeError, AttributeError):
                continue  # no usable hashtags on this tweet
            for tag in tags:
                hashtag_counter[tag] += 1

In [None]:
# Build a word cloud image from the hashtag frequencies and draw it without
# axis decorations on a 20x10 inch figure.
wordcloud = WordCloud(
    max_font_size=300,
    repeat=False,
    width=1200,
    height=800,
    max_words=1000,
).generate_from_frequencies(hashtag_counter)

fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()



<h1>Data Exploration</h1>

In [90]:
import pandas as pd
import json
# Open the 2020 tweet dump as a text file handle. The handle is a one-shot
# line iterator: once a loop has read it to the end, iterating it again yields
# nothing unless it is rewound or reopened. It is not closed in this notebook.
data = open("Tweet-Data/digital_twin_tweets-2020.jl")


In [49]:
# Show what kind of object open() returned.
type(data)


_io.TextIOWrapper

In [39]:
# Parse every line of `data` as JSON. Each iteration overwrites `doc`, so after
# the loop `doc` holds only the record from the final line of the file.
# Iterating reads the file handle to its end; a later pass over `data` yields
# nothing unless the handle is rewound or reopened.
for line in data:
    doc = json.loads(line)

In [41]:
# Show which Python type json.loads produced for the record.
type(doc)

dict

In [59]:
# List the top-level keys of the parsed record.
doc.keys()


dict_keys(['http_status', 'year', 'params', 'response'])

In [81]:
# Inspect the 'http_status' entry stored with the record.
doc['http_status']

200

In [80]:
# Inspect the 'year' entry stored with the record.
doc['year']

'2020'

In [83]:
# Inspect the 'params' entry stored with the record (a list of name/value pairs).
doc['params']


[['query',
  '#digitaltwin OR #digital_twin OR "digital twin" OR "digital twins" OR #digital_twins OR #digitaltwins'],
 ['max_results', '500'],
 ['start_time', '2020-01-01T00:00:00Z'],
 ['end_time', '2020-12-31T11:59:59Z'],
 ['tweet.fields',
  'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld'],
 ['expansions',
  'author_id,referenced_tweets.id,referenced_tweets.id.author_id,entities.mentions.username,attachments.poll_ids,attachments.media_keys,in_reply_to_user_id,geo.place_id'],
 ['media.fields',
  'duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width'],
 ['place.fields',
  'contained_within,country,country_code,full_name,geo,id,name,place_type'],
 ['poll.fields', 'duration_minutes,end_datetime,id,options,voting_status'],
 ['user.fields',
  'created_at,description,entities,id,location,name,pinned_tweet_id,profile_i

<h1>Response of the Request</h1>

In [82]:
# Keep the record's 'response' entry under its own name for the cells below
# and list its top-level keys.
response = doc['response']
response.keys()

dict_keys(['data', 'includes', 'errors', 'meta'])

In [84]:
# Inspect the 'errors' entry of the response.
response['errors']

[{'detail': 'Could not find tweet with referenced_tweets.id: [1212572346998874112].',
  'title': 'Not Found Error',
  'resource_type': 'tweet',
  'parameter': 'referenced_tweets.id',
  'value': '1212572346998874112',
  'type': 'https://api.twitter.com/2/problems/resource-not-found'},
 {'detail': 'Could not find tweet with referenced_tweets.id: [1212558559013036034].',
  'title': 'Not Found Error',
  'resource_type': 'tweet',
  'parameter': 'referenced_tweets.id',
  'value': '1212558559013036034',
  'type': 'https://api.twitter.com/2/problems/resource-not-found'},
 {'detail': 'Could not find tweet with referenced_tweets.id: [1212548159970275330].',
  'title': 'Not Found Error',
  'resource_type': 'tweet',
  'parameter': 'referenced_tweets.id',
  'value': '1212548159970275330',
  'type': 'https://api.twitter.com/2/problems/resource-not-found'},
 {'parameter': 'entities.mentions.username',
  'value': '3DSAcademy',
  'detail': 'User has been suspended: [3DSAcademy].',
  'title': 'Forbidden

In [115]:
# Inspect the 'meta' entry of the response.
response['meta']

{'newest_id': '1212738038679449606',
 'oldest_id': '1212163451440660481',
 'result_count': 290}

<h2>Tweet Includes</h2>

In [89]:
# List which kinds of objects are present under the response's 'includes' entry.
response['includes'].keys()

dict_keys(['media', 'users', 'tweets', 'places'])

In [102]:
# Check the container type of the 'media' entry before loading it into a DataFrame.
type(response['includes']['media'])

list

In [113]:
# Tabulate the 'places' entries of the includes as a DataFrame.
pd.DataFrame(response['includes']['places'])


Unnamed: 0,country_code,place_type,country,name,geo,full_name,id
0,JP,city,Japan,世田谷区,"{'type': 'Feature', 'bbox': [139.582428, 35.59...",東京 世田谷区,1b0cfebe3424e14f
1,AT,city,Österreich,Linz,"{'type': 'Feature', 'bbox': [14.245709, 48.211...","Linz, Österreich",0036224b2a183fcf


In [112]:
# Tabulate the 'users' entries of the includes and preview the first rows.
pd.DataFrame(response['includes']['users']).head()


Unnamed: 0,username,public_metrics,entities,url,protected,location,profile_image_url,created_at,name,verified,description,id,pinned_tweet_id
0,prolimglobal,"{'followers_count': 227, 'following_count': 18...","{'url': {'urls': [{'start': 0, 'end': 20, 'url...",http://t.co/QtJHdWgC,False,"Farmington Hills, MI",https://pbs.twimg.com/profile_images/106961374...,2012-09-05T13:23:40.000Z,PROLIM Corporation,False,"IT Consulting, PLM, Engineering Services.",804599257,
1,Windpower_Eng,"{'followers_count': 20065, 'following_count': ...","{'url': {'urls': [{'start': 0, 'end': 22, 'url...",http://t.co/vBCewbsVbl,False,"Cleveland, Ohio",https://pbs.twimg.com/profile_images/884754224...,2009-05-27T14:44:38.000Z,Windpower Engineering & Development,False,Industry Experts covering everything in wind e...,42896131,
2,fogle_shane,"{'followers_count': 19315, 'following_count': ...","{'url': {'urls': [{'start': 0, 'end': 23, 'url...",https://t.co/c9031YDqdt,False,United States,https://pbs.twimg.com/profile_images/135935176...,2010-11-30T13:46:14.000Z,Shane Fogle,False,Security Operations Director at Protiviti I Pa...,221370308,1.3636458616836915e+18
3,PVynckier,"{'followers_count': 20508, 'following_count': ...",,,False,Strasbourg,https://pbs.twimg.com/profile_images/761118751...,2015-06-19T12:28:41.000Z,Philippe Vynckier - Influencer,False,Interested in the field of information technol...,3334677605,
4,mclynd,"{'followers_count': 40221, 'following_count': ...","{'url': {'urls': [{'start': 0, 'end': 23, 'url...",https://t.co/SyKSshSvqI,False,United States,https://pbs.twimg.com/profile_images/135046612...,2008-11-23T17:05:54.000Z,Mark Lynd 🎙CISSP ISSMP ISSMP,False,"#CISO #CIO #USArmyVeteran, Father of 3 Girls, ...",17575069,1.3642098708918723e+18


In [111]:
# Tabulate the 'media' entries of the includes and preview the first rows.
pd.DataFrame(response['includes']['media']).head()

Unnamed: 0,type,width,height,url,media_key,public_metrics,preview_image_url,duration_ms
0,photo,1114,627,https://pbs.twimg.com/media/ENSC0bLXkAABYx1.jpg,3_1212738036829753344,,,
1,photo,268,170,https://pbs.twimg.com/media/ENSCnh5X0AABiYr.jpg,3_1212737815295021056,,,
2,photo,1301,732,https://pbs.twimg.com/media/ENSAoOXWoAMC3Yw.jpg,3_1212735628208676867,,,
3,photo,1024,768,https://pbs.twimg.com/media/ENRzDHjWwAAhknF.jpg,3_1212720697073647616,,,
4,photo,1024,768,https://pbs.twimg.com/media/ENRzAhwXUAEncA-.jpg,3_1212720652567924737,,,


In [114]:
# Tabulate the 'tweets' entries of the includes and preview the first rows.
pd.DataFrame(response['includes']['tweets']).head()

Unnamed: 0,entities,context_annotations,created_at,source,text,id,possibly_sensitive,public_metrics,author_id,conversation_id,reply_settings,lang,attachments,referenced_tweets,in_reply_to_user_id,geo
0,"{'mentions': [{'start': 95, 'end': 102, 'usern...","[{'domain': {'id': '65', 'name': 'Interests an...",2020-01-02T06:03:47.000Z,Twitter Web Client,"How AR, VR and digital twins optimize the indu...",1212615451005595648,False,"{'retweet_count': 6, 'reply_count': 0, 'like_c...",3334677605,1212615451005595648,everyone,en,,,,
1,"{'mentions': [{'start': 99, 'end': 113, 'usern...","[{'domain': {'id': '65', 'name': 'Interests an...",2020-01-02T12:21:59.000Z,Twitter Web App,AI geologists? The end of maps? Geological dat...,1212710626914717696,False,"{'retweet_count': 8, 'reply_count': 8, 'like_c...",235264546,1212710626914717696,everyone,en,{'media_keys': ['3_1212710617724997633']},,,
2,"{'hashtags': [{'start': 100, 'end': 112, 'tag'...",,2019-12-29T08:04:26.000Z,Twitter for iPhone,Get digital: Digital Twin alleviates construct...,1211196260507672576,False,"{'retweet_count': 5, 'reply_count': 0, 'like_c...",1560373568,1211196260507672576,everyone,en,,"[{'type': 'quoted', 'id': '1207334728359194625'}]",,
3,"{'mentions': [{'start': 63, 'end': 74, 'userna...","[{'domain': {'id': '46', 'name': 'Brand Catego...",2019-12-31T16:10:27.000Z,Sprinklr,2019 Best of the #QuickBytes show: #IBM Distin...,1212043348216823808,False,"{'retweet_count': 7, 'reply_count': 0, 'like_c...",1953648950,1212043348216823808,everyone,en,{'media_keys': ['3_1212043346681716740']},,,
4,"{'urls': [{'start': 172, 'end': 195, 'url': 'h...",,2020-01-02T08:40:52.000Z,Hootsuite Inc.,Das ganze ada-Team wünscht euch einen fabelhaf...,1212654980882022400,False,"{'retweet_count': 3, 'reply_count': 2, 'like_c...",1009012670790623232,1212654980882022400,everyone,de,{'media_keys': ['16_1212654979628056576']},,,


<h2>Tweet Data</h2>

In [67]:
# Keep the response's 'data' entry under its own name ("daten" is German for
# "data") and check its container type.
daten = response['data']
type(daten)

list

In [109]:
# Load the 'data' entries into a DataFrame (one row per entry) and preview the
# first rows.
df = pd.DataFrame(daten)
df.head()

Unnamed: 0,attachments,created_at,source,text,id,possibly_sensitive,public_metrics,entities,author_id,conversation_id,reply_settings,lang,referenced_tweets,context_annotations,in_reply_to_user_id,geo
0,{'media_keys': ['3_1212738036829753344']},2020-01-02T14:10:54.000Z,Zift Platform,The #digitaltwin shortens program schedules an...,1212738038679449606,False,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'hashtags': [{'start': 4, 'end': 16, 'tag': '...",804599257,1212738038679449606,everyone,en,,,,
1,{'media_keys': ['3_1212737815295021056']},2020-01-02T14:10:01.000Z,Sprout Social,How digital twins are transforming wind operat...,1212737817262116867,False,"{'retweet_count': 1, 'reply_count': 0, 'like_c...","{'urls': [{'start': 53, 'end': 76, 'url': 'htt...",42896131,1212737817262116867,everyone,en,,,,
2,,2020-01-02T14:08:45.000Z,Twitter for iPhone,"RT @PVynckier: How AR, VR and digital twins op...",1212737498411098116,False,"{'retweet_count': 6, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 13, 'usernam...",221370308,1212737498411098116,everyone,en,"[{'type': 'retweeted', 'id': '1212615451005595...","[{'domain': {'id': '65', 'name': 'Interests an...",,
3,,2020-01-02T14:03:38.000Z,Twitter for Android,RT @CliveBGS: AI geologists? The end of maps? ...,1212736208612278277,False,"{'retweet_count': 8, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 12, 'usernam...",544295578,1212736208612278277,everyone,en,"[{'type': 'retweeted', 'id': '1212710626914717...","[{'domain': {'id': '65', 'name': 'Interests an...",,
4,{'media_keys': ['3_1212735628208676867']},2020-01-02T14:01:20.000Z,The Social Jukebox,Accelerating Product Development: Digital Twin...,1212735630247124994,False,"{'retweet_count': 1, 'reply_count': 0, 'like_c...","{'hashtags': [{'start': 157, 'end': 179, 'tag'...",476443251,1212735630247124994,everyone,en,,"[{'domain': {'id': '65', 'name': 'Interests an...",,


In [5]:
# Print the 'source' field of every tweet in the dump.
#
# `data` is a file handle and can only be iterated once; an earlier cell
# already reads it to the end, which would leave this loop with nothing to
# print. Rewind it to the start of the file first.
data.seek(0)
for line in data:
    try:
        doc = json.loads(line)
        tweets = doc['response']['data']
    except (json.JSONDecodeError, KeyError, TypeError):
        continue  # best effort: skip malformed lines and pages without tweet data

    for tweet in tweets or []:
        # Handle each tweet on its own so that one tweet without a 'source'
        # field does not discard the remaining tweets of the same page.
        # Catching only the expected errors (instead of a bare `except`) also
        # keeps the cell interruptible.
        try:
            print(tweet['source'])
        except (KeyError, TypeError):
            continue

In [69]:
# List the column names of the tweet DataFrame.
df.columns

Index(['attachments', 'created_at', 'source', 'text', 'id',
       'possibly_sensitive', 'public_metrics', 'entities', 'author_id',
       'conversation_id', 'reply_settings', 'lang', 'referenced_tweets',
       'context_annotations', 'in_reply_to_user_id', 'geo'],
      dtype='object')