In [1]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import sweetviz as sv
import pandas as pd
import numpy as np
import json
import os

# Reading the following files

<ul>
    <li>reacted_posts.json: to gather metadata of the posts</li>
    <li>user_post_interaction.json: to get interaction data of users with missing language info</li>
    <li>user_info_expanded.pkl: to get the list of users with missing language info</li>
</ul>

In [2]:
# Load the raw inputs used by the rest of the notebook.
# The JSON files are read explicitly as UTF-8: the post content includes
# non-ASCII text (e.g. Hindi titles), and open() otherwise falls back to the
# platform's locale encoding, which is not UTF-8 everywhere.
with open('raw/reacted_posts.json', encoding='utf-8') as json_file:
    posts = pd.DataFrame(json.load(json_file))

with open('raw/user_post_interaction.json', encoding='utf-8') as json_file:
    log = pd.DataFrame(json.load(json_file))

# NOTE(review): unpickling can execute arbitrary code -- only load this file
# when it comes from a trusted source.
user_profile = pd.read_pickle("user_info_expanded.pkl")


# Shortlisting users with missing language information

In [3]:
# Keep only the profiles whose `language` value is missing, renumbered from 0.
na_users = (
    user_profile
    .loc[lambda profiles: profiles['language'].isna()]
    .reset_index(drop=True)
)
na_users


Unnamed: 0,user_id,language
0,1035438645_00614e127baf5032,
1,1035438645_0085cccaab2be238,
2,1035438645_03de497bc5eb8eaa,
3,1035438645_073be95b40a9eda3,
4,1035438645_0746f4306c89636c,
...,...,...
5541,u.browser.for.lite.uc.browser_c01e190db6718708,
5542,u.browser.for.lite.uc.browser_d80d95cde4777ad7,
5543,u.browser.for.lite.uc.browser_e204aa1960a5427b,
5544,u.browser.for.lite.uc.browser_efa957ae19d0a012,


# Filtering interaction data for the shortlisted users

In [4]:
# Restrict the interaction log to rows whose user_id appears in `na_users`.
na_user_log = (
    log
    .loc[lambda interactions: interactions['user_id'].isin(na_users['user_id'].tolist())]
    .reset_index(drop=True)
)
na_user_log

Unnamed: 0,user_id,reactions
0,1186255453_6b96200b883f6581,[{'post_id': 'native_vren8feb44033d85428330529...
1,864995365_a2b572a31e68cd12,[{'post_id': 'publicvibe_1611854902015326419'}]
2,864995365_9f4b8136228b138e,[{'post_id': 'native_vrhic587c122d164501940a5c...
3,1186255453_cf6c4eca3e1f9e84,[{'post_id': 'twitter_1516340979720024071'}]
4,199639326_31fecdd681abf0ef,[{'post_id': 'news_freepressjournal_2e7fb3b377...
...,...,...
5541,864995365_e92cc1084d1a05f1,[{'post_id': 'instagram_CKtWX67p8fz'}]
5542,864995365_6c0516bf8ccc3bf5,[{'post_id': 'news_filmibeat_f50cbf76ef4cf1f36...
5543,1035438645_e08329519b0325cf,[{'post_id': 'youtube_CTGL8qqjL7s'}]
5544,864995365_8abe257b0eb3da4d,[{'post_id': 'news_freepressjournal_64d7eeff21...


In [5]:
def get_posts(
    reactions: list
) -> list:
    """Return the ``post_id`` of every entry in ``reactions``, in order.

    Each entry is read with ``entry['post_id']``, so an entry without that
    key raises ``KeyError``. An empty input yields an empty list.
    """
    post_ids = []
    for entry in reactions:
        post_ids.append(entry['post_id'])
    return post_ids

In [6]:
# Swap each user's list of reaction dicts for the bare post ids, then unpack
# the lists so every (user, post) pair sits on its own row.
na_user_log = (
    na_user_log
    .assign(reactions=na_user_log['reactions'].map(get_posts))
    .explode("reactions")
)
na_user_log


Unnamed: 0,user_id,reactions
0,1186255453_6b96200b883f6581,native_vren8feb44033d85428330529580bc4f2645
1,864995365_a2b572a31e68cd12,publicvibe_1611854902015326419
2,864995365_9f4b8136228b138e,native_vrhic587c122d164501940a5c7531a73d392
3,1186255453_cf6c4eca3e1f9e84,twitter_1516340979720024071
4,199639326_31fecdd681abf0ef,news_freepressjournal_2e7fb3b377efa188c63ef071...
...,...,...
5542,864995365_6c0516bf8ccc3bf5,news_filmibeat_f50cbf76ef4cf1f366981fbaabec65af
5542,864995365_6c0516bf8ccc3bf5,twitter_1363850965829386241
5543,1035438645_e08329519b0325cf,youtube_CTGL8qqjL7s
5544,864995365_8abe257b0eb3da4d,news_freepressjournal_64d7eeff2197999cbe761d6e...


# Checking how many unique shortlisted users have interaction data

In [7]:
# Number of distinct user_id values (missing values counted, as with unique()).
na_user_log['user_id'].nunique(dropna=False)


5546

# Checking the languages of the content viewed by shortlisted users

In [8]:
# Attach post metadata to every interaction. The inner join keeps only the
# interactions whose post id is present in `posts`.
na_user_log = na_user_log.merge(
    posts,
    how="inner",
    left_on="reactions",
    right_on="post_id",
)
na_user_log.head(10)


Unnamed: 0,user_id,reactions,post_id,ml_language,ml_interests,content
0,1186255453_6b96200b883f6581,native_vren8feb44033d85428330529580bc4f2645,native_vren8feb44033d85428330529580bc4f2645,[en],[lifestyle],{'title': 'All your work will be easy by watch...
1,1186255453_a6408cc67e2615a9,native_vren8feb44033d85428330529580bc4f2645,native_vren8feb44033d85428330529580bc4f2645,[en],[lifestyle],{'title': 'All your work will be easy by watch...
2,1186255453_02b622f10df09c15,native_vren8feb44033d85428330529580bc4f2645,native_vren8feb44033d85428330529580bc4f2645,[en],[lifestyle],{'title': 'All your work will be easy by watch...
3,1186255453_372fee3f5e42cd90,native_vren8feb44033d85428330529580bc4f2645,native_vren8feb44033d85428330529580bc4f2645,[en],[lifestyle],{'title': 'All your work will be easy by watch...
4,1186255453_dbe30c68118a195b,native_vren8feb44033d85428330529580bc4f2645,native_vren8feb44033d85428330529580bc4f2645,[en],[lifestyle],{'title': 'All your work will be easy by watch...
5,864995365_a2b572a31e68cd12,publicvibe_1611854902015326419,publicvibe_1611854902015326419,[hi],"[news, howTo]",{'title': 'बेल्जियन मेलिनॉयस डॉग बन सकते हैं स...
6,864995365_9f4b8136228b138e,native_vrhic587c122d164501940a5c7531a73d392,native_vrhic587c122d164501940a5c7531a73d392,[hi],[lifestyle],{'title': 'जरूर देखें डिवाइडर कार एक्सीडेंट वी...
7,1186255453_57f88415546ee925,native_vrhic587c122d164501940a5c7531a73d392,native_vrhic587c122d164501940a5c7531a73d392,[hi],[lifestyle],{'title': 'जरूर देखें डिवाइडर कार एक्सीडेंट वी...
8,1186255453_cf6c4eca3e1f9e84,twitter_1516340979720024071,twitter_1516340979720024071,[en],[cricket],{'description': 'Bringing back 𝑻𝒉𝒆 𝒀𝒖𝒛𝒊 𝑴𝒂𝒈𝒊𝒄 ...
9,199639326_31fecdd681abf0ef,news_freepressjournal_2e7fb3b377efa188c63ef071...,news_freepressjournal_2e7fb3b377efa188c63ef071...,[en],"[sports, cricket]","{'title': 'IPL 2021 Qualifier 2, KKR vs DC: 5 ..."


In [10]:
# Gather, per user, the languages of every post they interacted with:
# explode to one row per (user, language), then regroup into a list per user.
na_user_log = (
    na_user_log
    .loc[:, ["user_id", "ml_language"]]
    .explode("ml_language")
    .groupby("user_id")['ml_language']
    .apply(list)
    .reset_index()
)
na_user_log


Unnamed: 0,user_id,ml_language
0,1035438645_00614e127baf5032,[hi]
1,1035438645_0085cccaab2be238,[hi]
2,1035438645_03de497bc5eb8eaa,[hi]
3,1035438645_073be95b40a9eda3,[en]
4,1035438645_0746f4306c89636c,"[hi, hi, hi]"
...,...,...
5459,u.browser.for.lite.uc.browser_c01e190db6718708,"[hi, en]"
5460,u.browser.for.lite.uc.browser_d80d95cde4777ad7,[en]
5461,u.browser.for.lite.uc.browser_e204aa1960a5427b,"[hi, en, ta]"
5462,u.browser.for.lite.uc.browser_efa957ae19d0a012,"[en, en]"


# Removing duplicates from the language arrays

In [11]:
# Deduplicate each user's languages, keeping them in first-seen order.
# dict.fromkeys() is used instead of set() because the iteration order of a
# set of strings can differ between interpreter runs (hash randomization),
# which made this cell's output non-reproducible.
# The renamed frame is reassigned rather than modified with inplace=True.
na_user_log = (
    na_user_log
    .assign(
        ml_language=na_user_log["ml_language"].map(
            lambda languages: list(dict.fromkeys(languages))
        )
    )
    .rename(columns={"ml_language": "language"})
)
na_user_log


Unnamed: 0,user_id,language
0,1035438645_00614e127baf5032,[hi]
1,1035438645_0085cccaab2be238,[hi]
2,1035438645_03de497bc5eb8eaa,[hi]
3,1035438645_073be95b40a9eda3,[en]
4,1035438645_0746f4306c89636c,[hi]
...,...,...
5459,u.browser.for.lite.uc.browser_c01e190db6718708,"[hi, en]"
5460,u.browser.for.lite.uc.browser_d80d95cde4777ad7,[en]
5461,u.browser.for.lite.uc.browser_e204aa1960a5427b,"[hi, ta, en]"
5462,u.browser.for.lite.uc.browser_efa957ae19d0a012,[en]
