# Extracting Video History from json

This notebook is used to extract the Video Browsing History from a user_data.json file downloaded from TikTok.

Author: Audrey Yip & Jyontika Kapoor

Date: 03-04-2024

In [1]:
# imports
import random
import os
import json
import csv
from datetime import datetime

In [2]:
#team members put path to raw data here 
filename = '/Users/jyontika/Desktop/user_data_jyontika.json' 

with open(filename, 'r') as myFile:    
  data = json.load(myFile)

In [3]:
video_URLs = data['Activity']['Video Browsing History']

print("Number of Videos:", len(video_URLs))

Number of Videos: 1


In [4]:
# create anonymized json file, per Eni's instructions
random_number = str(random.randint(10000, 99999))
path_raw = "url-json-raw/"
filename = "Sec2Gr3_" + random_number + ".json"
outfile_path_raw =  os.path.join(path_raw, filename) 

with open(outfile_path_raw, 'w') as outfile:
    json.dump(video_URLs, outfile)

print("Video Browsing Data has been dumped into", outfile_path_raw)

Video Browsing Data has been dumped into url-json-raw/Sec2Gr3_31004.json


### Extract Dates related to world news 

In [5]:
#create a function to filter dates

def filter_dates(start_date, end_date, data):
    filtered_data = []
    for entry in data['VideoList']:
        date_str = entry['Date']
        date_obj = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
        if start_date <= date_obj <= end_date:
            filtered_data.append(entry)
    return {"VideoList": filtered_data}


In [6]:
#oct 7 and 2 months after
start_date = datetime(2023, 10, 7)
end_date = datetime(2023, 12, 7)

with open(outfile_path_raw, 'r') as file:
    json_data = json.load(file)

# Filter the data based on the date range
filtered_data = filter_dates(start_date, end_date, json_data)


In [7]:
# save the filtered data to a new JSON file
filtered_path = "url-json-oct7/"
filename = "Sec2Gr3_" + random_number + ".json"
outfile_path_filtered =  os.path.join(filtered_path, filename) 

with open(outfile_path_filtered, 'w') as file:
    json.dump(filtered_data['VideoList'], file, indent=2)

print(f"Filtered data saved to {outfile_path_filtered}")

Filtered data saved to url-json-oct7/Sec2Gr3_31004.json


### parking lot for code

In [8]:
#convert video URLs to a csv -- ONLY with link

# video_urls = [entry['Link'] for entry in video_URLs['VideoList']]

# path = "url-csv/"
# filename  = 'video_urls_' + random_number + '.csv' 
# csv_file_path = os.path.join(path, filename) 

# # write the URLs into a csv
# with open(csv_file_path, 'w', newline='') as csv_file:
#     csv_writer = csv.writer(csv_file)
#     csv_writer.writerow(['Video URL'])  # Write header

#     for url in video_urls:
#         csv_writer.writerow([url])

# print("Video URLs have been dumped into", csv_file_path)

In [9]:
# # this chunk of code converted video URLs to CSV with dates

# video_data = video_URLs['VideoList']
# video_urls = [entry['Link'] for entry in video_data]
# video_dates = [entry['Date'] for entry in video_data]

# # Combine links and dates into pairs
# video_info = zip(video_urls, video_dates)

# # Specify the CSV file path
# path = "url-csv/"
# filename = f'video_urls_{random_number}.csv'
# csv_file_path = os.path.join(path, filename)

# # write the URLs and dates into a csv
# with open(csv_file_path, 'w', newline='') as csv_file:
#     csv_writer = csv.writer(csv_file)
#     csv_writer.writerow(['Video URL', 'Date']) 
    
#     for url, date in video_info:
#         csv_writer.writerow([url, date])

# print("Video URLs and Dates have been dumped into", csv_file_path)


### Extract following list

Please:
1) Run the first two cells of this notebook, and then these cells
2) Copy and paste this list into an email and send it to me, with the name f'acc_{random_number}'
3) If your account doesn't have a random number (i.e. it doesn't have video browsing history so you didn't run it above, please just name it 'acc_{your_initials}{counter}')

In [11]:
import flatdict
from collections import defaultdict

In [12]:
print("Number of Following:", len(data['Activity']['Following List']['Following'])) # print number of followers

d =  flatdict.FlatDict(data['Activity']['Following List'], delimiter='.')['Following'] #flattens dict; gives list
res = defaultdict(list)
{res[key].append(sub[key]) for sub in d for key in sub} # makes one dict from list of dicts

follow_list = res['UserName'] # list of user's followers
follow_list

Number of Following: 162


['lochan.k',
 'r.ohini',
 '20amuller',
 'thefurnituredoctor',
 'ryanisreallypolite',
 'cnovello13',
 'millymillyrockz',
 'lilchikiszat',
 'karinaaarose',
 'misssmaiah',
 'alexamichela',
 'urvikap97',
 'ayanna_moise',
 'nasirwynruit',
 'jvmes.music',
 'peaceluvcourt',
 'ruiepooie',
 'xo.zen',
 'eeshkapeesh999',
 'superkeara',
 'selenagomez',
 'karissalauren',
 'phillyfoodies',
 'itzelt.reyes',
 'shruthisundar01',
 'imogenieinabottle',
 'ripberniemadoff',
 'sslizzle',
 'alex.ndratx',
 'srimyla',
 'alakeyeah',
 'sachitanwar6',
 'thewellesleynews',
 'gshell08',
 'oliviaholtzinger10',
 'mkennedy28',
 'box_of_olives',
 'cosyeet',
 'sophferrante',
 'jassaco_xo',
 'zoralikedora',
 'planetmargs',
 'erictran42',
 'bhadminton',
 'whiteboyofthemonth69',
 'papayasguy',
 'kenzie461',
 'sophdog03',
 'ellie.bk',
 'evanpalmblad',
 'rosedg18',
 'averyhirschofficial',
 'nationltreasure',
 'liesel.liesel',
 'ash.reeeee',
 'sarahcatherinne_',
 'shirtsbylena',
 'igobyadi',
 'jamsino',
 'krithi.com',
 'uhave

In [14]:
file_path = '/Users/jyontika/Desktop/follow_data.csv'

# Write the follow_list to the CSV file
with open(file_path, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    for item in follow_list:
        csv_writer.writerow([item])

print("CSV file has been created successfully.")

CSV file has been created successfully.
