In [1]:
import pandas as pd
import numpy as np
import emot
# Reached only if every import in this cell succeeded.
print("Imports successful")



Imports successful


In [2]:
def read_data(path: str) -> pd.DataFrame:
    """Load the JSON file at ``path`` into a DataFrame and report success.

    Args:
        path: Location of the JSON input, handed straight to ``pd.read_json``.

    Returns:
        The DataFrame built by ``pd.read_json``.

    NOTE(review): the ``id`` values in the recorded preview look rounded, which
    would be consistent with precision loss during dtype inference -- confirm
    before relying on the original ids downstream.
    """
    loaded = pd.read_json(path)
    # A failed read raises above, so this message is only printed on success.
    print(f"Read data from {path} successfully")
    return loaded

# Load posts.json and print the first rows as a quick sanity check.
path = "posts.json"
df = read_data(path)
print(df.head())


Read data from posts.json successfully
                                inputUrl                   id     type  \
0  https://www.instagram.com/tarini_shah  3242389063277678592    Video   
1  https://www.instagram.com/tarini_shah  3124157038573506048  Sidecar   
2  https://www.instagram.com/tarini_shah  3101602434564655104    Video   
3  https://www.instagram.com/tarini_shah  3409072589826453504    Video   
4  https://www.instagram.com/tarini_shah  3404828945839726592  Sidecar   

     shortCode                                            caption hashtags  \
0  Cz_R6yURw-J  Tag a person you would witness this peace and ...       []   
1  CtbPEYqtc3c  our BIG reveal ⭐️\n\nAgasthya and I shot for t...       []   
2  CsLGvsytoi5                sapne sajane ki humko mili wajeh ✈️       []   
3  C9PdXVaIG__  been wearing this everyday because \n\n✨it’s s...       []   
4  C9AYeL9vqA-  pick your poison babe, I’m poison either way 🎱...       []   

                         mentions              

In [3]:
def replace_emojis_with_text(text: str) -> str:
    """Replace every emoji in ``text`` with the description reported by ``emot``.

    Args:
        text: Text that may contain emojis. Non-string values (for example the
            NaN placeholder used for posts without comments) are returned
            unchanged without being handed to ``emot``.

    Returns:
        ``text`` with each detected emoji substituted by its ``mean`` entry
        (e.g. ``:glowing_star:``). If processing fails, the error is printed
        and the text is returned as it stood when the failure occurred.
    """
    # Missing values arrive as float NaN / None; there is nothing to replace.
    if not isinstance(text, str):
        return text

    # Build the emot helper once and keep it on the function object, so that
    # applying this function to a whole column does not construct it per row.
    emot_obj = getattr(replace_emojis_with_text, "_emot_obj", None)
    if emot_obj is None:
        emot_obj = emot.emot()
        replace_emojis_with_text._emot_obj = emot_obj

    try:
        emoji_info = emot_obj.emoji(text)
        # "value" and "mean" are parallel lists: each emoji and its description.
        for symbol, meaning in zip(emoji_info["value"], emoji_info["mean"]):
            text = text.replace(symbol, meaning)
    except Exception as e:
        # Best effort: report the problem and keep whatever text we have so far.
        print(f"An error occurred while processing the text: {text}. The error is: {e}")
    return text

# Smoke test: both emojis should be printed as their text descriptions.
print(replace_emojis_with_text("I love 😊 and 🌟"))


I love :smiling_face_with_smiling_eyes: and :glowing_star:


In [4]:

# Function to process the DataFrame
def process_data(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split raw post records into a posts table and a comments table.

    Steps:
      1. Drop rows whose ``type`` is ``"Video"``, whose ``likesCount`` is
         ``-1.0`` (presumably a "likes hidden" marker -- TODO confirm), or
         whose ``images`` value is missing or empty.
      2. Keep and rename the id / comment count / like count / comments /
         images columns; keep only the first image of each post.
      3. Replace ``id`` with a 1-based sequential id.
      4. Explode the comment texts into one row per comment and replace
         emojis in them with their text descriptions.

    Args:
        df: Raw posts with at least the columns ``type``, ``likesCount``,
            ``images``, ``id``, ``commentsCount`` and ``latestComments``.
            The frame passed in is not modified.

    Returns:
        ``(posts, comments)``:
          * ``posts`` has columns ``id``, ``n_comments``, ``n_likes``, ``image``.
          * ``comments`` has columns ``id``, ``comments``, one row per comment.
            Posts without comments contribute a single row holding NaN. The
            index repeats the post's row label for each of its comments.
    """
    print("Starting data processing")

    # Filter early to reduce DataFrame size. The emptiness check runs second
    # because len() cannot be taken of a missing ``images`` value.
    keep = (df["type"] != "Video") & (df["likesCount"] != -1.0) & df["images"].notna()
    posts = df.loc[keep]
    posts = posts.loc[posts["images"].map(len) > 0]

    # Select and rename the relevant columns. rename/reset_index return new
    # frames, so the column updates below never write into a slice of ``df``.
    posts = (
        posts[["id", "commentsCount", "likesCount", "latestComments", "images"]]
        .rename(
            columns={
                "commentsCount": "n_comments",
                "likesCount": "n_likes",
                "latestComments": "comments",
                "images": "image",
            }
        )
        .reset_index(drop=True)
    )
    posts = posts.assign(
        image=posts["image"].str[0],  # first image when a post has several
        comments=posts["comments"].map(_comment_texts),
        id=posts.index + 1,  # fresh 1-based id replacing the original one
    )

    # One row per comment. Missing comments stay NaN and are skipped by the
    # emoji replacer (na_action="ignore") instead of being passed to it.
    comments = (
        posts[["id", "comments"]]
        .explode("comments")
        .assign(
            comments=lambda frame: frame["comments"].map(
                replace_emojis_with_text, na_action="ignore"
            )
        )
    )

    print("Data processing completed")
    return posts.drop(columns="comments"), comments


def _comment_texts(raw_comments):
    """Return the ``text`` of each comment record as a list, or NaN if none exist."""
    if not isinstance(raw_comments, (list, tuple)):
        # e.g. NaN when a post carries no comment payload at all
        return np.nan
    texts = [c["text"] for c in raw_comments if isinstance(c, dict) and "text" in c]
    return texts if texts else np.nan

# Smoke test on posts.json: reload the raw data, process it and preview both
# resulting frames. df_processed / df_comments_processed are reused by the
# save_data test further down.
path = "posts.json"
df = read_data(path)
df_processed, df_comments_processed = process_data(df)
print(df_processed.head())
print(df_comments_processed.head())


Read data from posts.json successfully
Starting data processing
Data processing completed
   id  n_comments  n_likes                                              image
0   1         374   255816  https://scontent.cdninstagram.com/v/t51.29350-...
1   2          75    32099  https://scontent.cdninstagram.com/v/t39.30808-...
2   3          46    38328  https://scontent.cdninstagram.com/v/t39.30808-...
3   4         492   130388  https://scontent.cdninstagram.com/v/t51.29350-...
4   5        1257   303812  https://scontent.cdninstagram.com/v/t51.29350-...
   id                        comments
0   1            Arey they cousins???
0   1                              Bb
0   1                           .b ..
0   1                               B
0   1  :smiling_face_with_heart-eyes:


In [5]:
def save_data(
    df: pd.DataFrame,
    df_comments: pd.DataFrame,
    posts_path: str = "only_posts.csv",
    comments_path: str = "posts_comments.csv",
) -> None:
    """Write the posts and comments frames to CSV files.

    Args:
        df: Posts table; written comma-separated without the index.
        df_comments: Comments table; written ';'-separated without the index.
        posts_path: Destination of the posts CSV. Defaults to the file name
            this function has always used.
        comments_path: Destination of the comments CSV. Defaults to the file
            name this function has always used.
    """
    df.to_csv(posts_path, index=False)
    # The comments file uses ';' as separator, unlike the posts file, which
    # uses to_csv's default ','.
    df_comments.to_csv(comments_path, index=False, sep=";")
    print("Data saved successfully")

# Test this function: writes the frames produced by the process_data test above.
save_data(df_processed, df_comments_processed)


Data saved successfully


In [6]:
def main(paths=("posts.json", "posts2.json")):
    """Run the whole pipeline: read every input file, process, and save.

    Args:
        paths: JSON files to read, in order. Defaults to the two files this
            pipeline has always combined.
    """
    # Read, process and save the data
    frames = [read_data(path) for path in paths]
    # ignore_index gives the combined frame unique row labels instead of
    # repeating each file's own 0..n-1 labels.
    combined = pd.concat(frames, ignore_index=True)
    posts, comments = process_data(combined)
    save_data(posts, comments)
    
    
# Run the full pipeline when this code is executed directly.
if __name__ == "__main__":
    main()


Read data from posts.json successfully
Read data from posts2.json successfully
Starting data processing
Data processing completed
Data saved successfully
