In [6]:
import json
import pandas as pd
from datetime import datetime

# Load the JSON data from file
with open('data/twitter-1mb.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the 'rows' list to a DataFrame
tweets_data = data['rows']
df = pd.DataFrame(data)



In [11]:
data

{'total_rows': 185875709,
 'offset': 0,
 'rows': [{'id': '1406813874750300166',
   'key': [2021,
    6,
    21,
    '1405853521107324933',
    '1055330142313046016',
    '1406813874750300166'],
   'value': {'text': 'Such a cute pupper 😍'},
   'doc': {'_id': '1406813874750300166',
    '_rev': '1-1a7924e962def1c92ecd08ee1537ea1b',
    'data': {'author_id': '1055330142313046016',
     'conversation_id': '1405853521107324933',
     'created_at': '2021-06-21T03:18:59.000Z',
     'entities': {'mentions': [{'start': 0,
        'end': 13,
        'username': 'pup_chazable'}]},
     'geo': {},
     'lang': 'en',
     'public_metrics': {'retweet_count': 0,
      'reply_count': 0,
      'like_count': 0,
      'quote_count': 0},
     'text': '@pup_chazable Such a cute pupper 😍',
     'sentiment': 0.7142857142857143},
    'matching_rules': [{'id': 1406789634793697300,
      'tag': 'Australia-based users or Australia-located tweets, but no re-tweets'}]}},
  {'id': '1406814402272137216',
   'key': [2

In [None]:
# Ensure 'created_at' is a datetime object and create new columns for analysis
df['created_at'] = pd.to_datetime(df['created_at'])
df['hour'] = df['created_at'].dt.hour
df['date'] = df['created_at'].dt.date

# Group by hour to find the happiest and most active hour
hourly_sentiment = df.groupby('hour')['sentiment'].mean().idxmax()
hourly_activity = df.groupby('hour').size().idxmax()

# Group by date to find the happiest and most active day
daily_sentiment = df.groupby('date')['sentiment'].mean().idxmax()
daily_activity = df.groupby('date').size().idxmax()

print(f"The happiest hour ever: {hourly_sentiment}:00")
print(f"The most active hour ever: {hourly_activity}:00")
print(f"The happiest day ever: {daily_sentiment}")
print(f"The most active day ever: {daily_activity}")


In [52]:
import json
from collections import defaultdict
from dateutil import parser
import time

# start = time.localtime()
# Load the JSON data
with open('data/twitter-50mb.json') as f:
    data = json.load(f)

# Extract relevant fields and calculate sentiment scores
tweets = []
for tweet in data['rows']:
    # if 'value' not in tweet:
    #     continue
    if 'doc' not in tweet:
        continue
    if 'sentiment' not in tweet['doc']['data']:
        continue
    # created_at = datetime.fromisoformat(tweet['doc']['data']['created_at'])
    created_at = parser.parse(tweet['doc']['data']['created_at'])
    sentiment_score = tweet['doc']['data']['sentiment']
    if isinstance(sentiment_score, dict):
        sentiment_score = sentiment_score['score']
    # print(sentiment_score)
    tweets.append({'created_at': created_at, 'sentiment_score': sentiment_score})

tweets_df = pd.DataFrame(tweets)
# Convert the created_at column to datetime
tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])

# Extract the hour and day from the created_at column
tweets_df['hour'] = tweets_df['created_at'].dt.hour
tweets_df['day'] = tweets_df['created_at'].dt.date

happiest_hour = tweets_df.groupby('hour')['sentiment_score'].sum().idxmax()
happiest_day = tweets_df.groupby('day')['sentiment_score'].sum().idxmax()
most_active_hour = tweets_df['hour'].value_counts().idxmax()
most_active_day = tweets_df['day'].value_counts().idxmax()

# end = time.localtime()

# print(end - start)
print(f"Happiest hour: {happiest_hour}")
print(f"Happiest day: {happiest_day}")
print(f"Most active hour: {most_active_hour}")
print(f"Most active day: {most_active_day}")



Happiest hour: 2
Happiest day: 2021-06-21
Most active hour: 1
Most active day: 2021-06-21


In [23]:
tweets

[{'created_at': datetime.datetime(2021, 6, 21, 3, 18, 59, tzinfo=tzutc()),
  'sentiment_score': 0.7142857142857143},
 {'created_at': datetime.datetime(2021, 6, 21, 3, 21, 5, tzinfo=tzutc()),
  'sentiment_score': 0.36363636363636365},
 {'created_at': datetime.datetime(2021, 6, 21, 10, 23, 48, tzinfo=tzutc()),
  'sentiment_score': 0},
 {'created_at': datetime.datetime(2021, 6, 21, 10, 56, 40, tzinfo=tzutc()),
  'sentiment_score': 0.25},
 {'created_at': datetime.datetime(2021, 6, 21, 11, 9, 17, tzinfo=tzutc()),
  'sentiment_score': 0},
 {'created_at': datetime.datetime(2021, 6, 21, 23, 36, 59, tzinfo=tzutc()),
  'sentiment_score': -0.09523809523809523},
 {'created_at': datetime.datetime(2021, 6, 21, 4, 14, 46, tzinfo=tzutc()),
  'sentiment_score': 0.3},
 {'created_at': datetime.datetime(2021, 6, 21, 0, 46, 21, tzinfo=tzutc()),
  'sentiment_score': 0.09090909090909091},
 {'created_at': datetime.datetime(2021, 6, 21, 14, 2, 21, tzinfo=tzutc()),
  'sentiment_score': 0.08333333333333333},
 {'

In [39]:
# Group tweets by hour and day
tweets_by_hour = defaultdict(list)
tweets_by_day = defaultdict(list)
for tweet in tweets:
    hour = tweet['created_at'].hour
    day = tweet['created_at'].date()
    tweets_by_hour[hour].append(tweet)
    tweets_by_day[day].append(tweet)


happiest_hour = max(tweets_by_hour.items(), key=lambda x: sum(tweet['sentiment_score'] for tweet in x[1]))
happiest_day = max(tweets_by_day.items(), key=lambda x: sum(tweet['sentiment_score'] for tweet in x[1]))
most_active_hour = max(tweets_by_hour.items(), key=lambda x: len(x[1]))
most_active_day = max(tweets_by_day.items(), key=lambda x: len(x[1]))


print(f"Happiest hour: {happiest_hour[0]}")
print(f"Happiest day: {happiest_day[0]}")
print(f"Most active hour: {most_active_hour[0]}")
print(f"Most active day: {most_active_day[0]}")

Happiest hour: 5
Happiest day: 2021-06-21
Most active hour: 2
Most active day: 2021-06-21


In [60]:
import json
import pandas as pd

# Load the JSON data
with open('data/twitter-1mb.json') as f:
    data = json.load(f)

# Create a list to store rows with 'doc' key
rows_with_doc = []

# Filter out rows without 'doc' key
for row in data['rows']:
    if 'doc' in row:
        rows_with_doc.append(row)

# Create a DataFrame from the rows with 'doc' key
tweets_df = pd.json_normalize(rows_with_doc)



TypeError: {'id': '1406813874750300166', 'key': [2021, 6, 21, '1405853521107324933', '1055330142313046016', '1406813874750300166'], 'value': {'text': 'Such a cute pupper 😍'}, 'doc': {'_id': '1406813874750300166', '_rev': '1-1a7924e962def1c92ecd08ee1537ea1b', 'data': {'author_id': '1055330142313046016', 'conversation_id': '1405853521107324933', 'created_at': '2021-06-21T03:18:59.000Z', 'entities': {'mentions': [{'start': 0, 'end': 13, 'username': 'pup_chazable'}]}, 'geo': {}, 'lang': 'en', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': '@pup_chazable Such a cute pupper 😍', 'sentiment': 0.7142857142857143}, 'matching_rules': [{'id': 1406789634793697300, 'tag': 'Australia-based users or Australia-located tweets, but no re-tweets'}]}} has non list value {'_id': '1406813874750300166', '_rev': '1-1a7924e962def1c92ecd08ee1537ea1b', 'data': {'author_id': '1055330142313046016', 'conversation_id': '1405853521107324933', 'created_at': '2021-06-21T03:18:59.000Z', 'entities': {'mentions': [{'start': 0, 'end': 13, 'username': 'pup_chazable'}]}, 'geo': {}, 'lang': 'en', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': '@pup_chazable Such a cute pupper 😍', 'sentiment': 0.7142857142857143}, 'matching_rules': [{'id': 1406789634793697300, 'tag': 'Australia-based users or Australia-located tweets, but no re-tweets'}]} for path doc. Must be list or null.

In [54]:
tweets_df

Unnamed: 0,id,key,value.text,doc._id,doc._rev,doc.data.author_id,doc.data.conversation_id,doc.data.created_at,doc.data.entities.mentions,doc.data.lang,...,doc.data.entities.annotations,doc.data.sentiment.score,doc.data.sentiment.comparative,doc.data.sentiment.calculation,doc.data.sentiment.tokens,doc.data.sentiment.words,doc.data.sentiment.positive,doc.data.sentiment.negative,doc.data.entities.cashtags,doc.data.entities.hashtags
0,1406813874750300166,"[2021, 6, 21, 1405853521107324933, 10553301423...",Such a cute pupper 😍,1406813874750300166,1-1a7924e962def1c92ecd08ee1537ea1b,1055330142313046016,1405853521107324933,2021-06-21T03:18:59.000Z,"[{'start': 0, 'end': 13, 'username': 'pup_chaz...",en,...,,,,,,,,,,
1,1406814402272137216,"[2021, 6, 21, 1405853521107324933, 14043711962...",Nawww thanks! 🥰 *Nuzzles* right back at ya pup! 😝,1406814402272137216,1-d6ce7e1e33721a9f3aed148b2cc1a64c,1404371196267819012,1405853521107324933,2021-06-21T03:21:05.000Z,"[{'start': 0, 'end': 10, 'username': 'PupLucky...",en,...,,,,,,,,,,
2,1406920780328312832,"[2021, 6, 21, 1405853729555890177, 83195869278...",Just wondered if the peregrines been named yet...,1406920780328312832,1-b5f329c5063489fa0863165e1660f367,831958692786417669,1405853729555890177,2021-06-21T10:23:48.000Z,"[{'start': 0, 'end': 5, 'username': 'WKWT'}]",en,...,,,,,,,,,,
3,1406929052724654084,"[2021, 6, 21, 1405853729555890177, 83195869278...",Thanks. Sorry missed that. Great result though.,1406929052724654084,1-87635ddc02305a887c0d64952ff47929,831958692786417669,1405853729555890177,2021-06-21T10:56:40.000Z,"[{'start': 0, 'end': 5, 'username': 'WKWT'}]",en,...,,,,,,,,,,
4,1406932228165099530,"[2021, 6, 21, 1405853924251226112, 91460102205...",Can you point to some of information about thi...,1406932228165099530,1-eb8055d1705034e0b1ee1d93c4cac920,914601022051336192,1405853924251226112,2021-06-21T11:09:17.000Z,"[{'start': 0, 'end': 16, 'username': 'Akiyoshi...",en,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1406973646577438723,"[2021, 6, 21, 1406057579659153409, 11029104390...",I went there for a weeding a few months ago. T...,1406973646577438723,1-fcc55d377569d9adad6e9ffde3e153b1,1102910439015370752,1406057579659153409,2021-06-21T13:53:52.000Z,"[{'start': 0, 'end': 12, 'username': 'CaroDiRu...",en,...,,,,,,,,,,
996,1406987309690482689,"[2021, 6, 21, 1406058146934706188, 60866631, 1...","Australians don’t want to send her to a , n th...",1406987309690482689,1-5d3d009eae0f57cc9a3efeef64056c01,60866631,1406058146934706188,2021-06-21T14:48:09.000Z,"[{'start': 0, 'end': 5, 'username': 'cnni'}]",en,...,"[{'start': 44, 'end': 52, 'probability': 0.464...",,,,,,,,,
997,1406955275370729480,"[2021, 6, 21, 1406058213787701249, 122395798, ...",So cute!,1406955275370729480,1-5b14e6ffa418a56e5b8f5cdb18ccdaf7,122395798,1406058213787701249,2021-06-21T12:40:52.000Z,"[{'start': 0, 'end': 13, 'username': 'QuietScr...",en,...,,,,,,,,,,
998,1407119087780565000,"[2021, 6, 21, 1406059272568553472, 1404434184,...",Because if they are wrong the s will be all ov...,1407119087780565000,1-4130137cc4fbda9e80277775303c2e0a,1404434184,1406059272568553472,2021-06-21T23:31:48.000Z,"[{'start': 0, 'end': 16, 'username': 'Grassroo...",en,...,"[{'start': 74, 'end': 84, 'probability': 0.758...",,,,,,,,,


In [55]:

tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])

tweets_df['hour'] = tweets_df['created_at'].dt.hour
tweets_df['day'] = tweets_df['created_at'].dt.date

happiest_hour = tweets_df.groupby('hour')['sentiment_score'].sum().idxmax()
happiest_day = tweets_df.groupby('day')['sentiment_score'].sum().idxmax()
most_active_hour = tweets_df['hour'].value_counts().idxmax()
most_active_day = tweets_df['day'].value_counts().idxmax()

print(f"Happiest hour: {happiest_hour}")
print(f"Happiest day: {happiest_day}")
print(f"Most active hour: {most_active_hour}")
print(f"Most active day: {most_active_day}")

KeyError: 'created_at'