In [1]:
import os
import pandas as pd
import json


## Generate profiles data frame

In [2]:

# Directory that holds one JSON export per profile
folder_path = "profiles"

# One accumulator per output column; index i of every list describes the same profile
alias_list, number_posts_list, number_followers_list = [], [], []
number_following_list, website_list = [], []

for filename in os.listdir(folder_path):
    # Only the JSON exports are of interest; skip everything else in the folder
    if not filename.endswith(".json"):
        continue

    # Profiles are read as UTF-8
    with open(os.path.join(folder_path, filename), encoding='utf-8') as file:
        data = json.load(file)

    # dict.get() yields None for attributes a profile does not provide
    alias_list.append(data.get("alias"))
    number_posts_list.append(data.get("numberPosts"))
    number_followers_list.append(data.get("numberFollowers"))
    number_following_list.append(data.get("numberFollowing"))
    website_list.append(data.get("website"))

# Assemble the columns into the profiles data frame (one row per JSON file)
data = dict(
    alias=alias_list,
    numberPosts=number_posts_list,
    numberFollowers=number_followers_list,
    numberFollowing=number_following_list,
    website=website_list,
)
profiles_df = pd.DataFrame(data)

# Show the result
print(profiles_df)


                  alias  numberPosts  numberFollowers  numberFollowing  \
0            1misssmeis          988           720979              233   
1                3ala2o          938           792886              466   
2                   433         6009         14545102              433   
3        6senseofficial         3324           243094                0   
4               7ikhals         1444           219458              221   
..                  ...          ...              ...              ...   
973             _ingo_1          422           149566              127   
974  _mariannejacobsen_         1593           189279              290   
975             _picolo          776           927457              566   
976          _tinamaria          821           160393              730   
977              _tuck4         1623           139150              246   

                                               website  
0                                    www.sylviemeis.de

## Generate posts data frame

In [3]:
import os
import pandas as pd
import json

# Directory that holds one JSON export per profile
folder_path = "profiles"

# One accumulator per output column; index i of every list describes the same post
alias_list, url_image_list, is_video_list = [], [], []
multiple_image_list, tags_list, mentions_list = [], [], []
description_list, date_list, number_likes_list = [], [], []

for filename in os.listdir(folder_path):
    # Only the JSON exports are of interest; skip everything else in the folder
    if not filename.endswith(".json"):
        continue

    # Profiles are read as UTF-8
    with open(os.path.join(folder_path, filename), encoding='utf-8') as file:
        data = json.load(file)

    # Every post row is tagged with the alias of the profile it came from
    alias = data.get("alias")

    # A profile without a "posts" entry contributes no rows
    for post in data.get("posts", []):
        # dict.get() yields None for attributes a post does not provide
        url_image_list.append(post.get("urlImage"))
        is_video_list.append(post.get("isVideo"))
        multiple_image_list.append(post.get("multipleImage"))
        tags_list.append(post.get("tags"))
        mentions_list.append(post.get("mentions"))
        description_list.append(post.get("description"))
        date_list.append(post.get("date"))
        number_likes_list.append(post.get("numberLikes"))
        alias_list.append(alias)

# Assemble the columns into the posts data frame (one row per post)
data = dict(
    alias=alias_list,
    urlImage=url_image_list,
    isVideo=is_video_list,
    multipleImage=multiple_image_list,
    tags=tags_list,
    mentions=mentions_list,
    description=description_list,
    date=date_list,
    numberLikes=number_likes_list,
)
posts_df = pd.DataFrame(data)

# Show the result
print(posts_df)


            alias                                           urlImage  isVideo  \
0      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
1      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
2      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
3      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
4      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
...           ...                                                ...      ...   
16534      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16535      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16536      _tuck4  [https://scontent.cdninstagram.com/t51.2885-15...    False   
16537      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16538      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   

       multipleImage       

## Average number of likes per post

In [4]:

# Mean numberLikes over the posts of each alias (a Series indexed by alias)
average_likes = posts_df['numberLikes'].groupby(posts_df['alias']).mean()

# Look up each profile's alias in that Series; aliases without an entry get NaN
profiles_df['average_likes'] = profiles_df['alias'].map(average_likes)

# Show profiles_df with the new column
print(profiles_df)


                  alias  numberPosts  numberFollowers  numberFollowing  \
0            1misssmeis          988           720979              233   
1                3ala2o          938           792886              466   
2                   433         6009         14545102              433   
3        6senseofficial         3324           243094                0   
4               7ikhals         1444           219458              221   
..                  ...          ...              ...              ...   
973             _ingo_1          422           149566              127   
974  _mariannejacobsen_         1593           189279              290   
975             _picolo          776           927457              566   
976          _tinamaria          821           160393              730   
977              _tuck4         1623           139150              246   

                                               website  average_likes  
0                                    ww

## Filtering

In [10]:
# Keep only profiles with
#   numberFollowers < 1,000,000  and  average_likes < 200,000.
# A comparison against NaN is False, so profiles whose average_likes is NaN are dropped too.
MAX_FOLLOWERS = 1_000_000
MAX_AVERAGE_LIKES = 200_000

keep_profile = (
    (profiles_df['numberFollowers'] < MAX_FOLLOWERS)
    & (profiles_df['average_likes'] < MAX_AVERAGE_LIKES)
)
# .copy() detaches the filtered frame from the unfiltered one, so that adding columns to it
# later is an unambiguous write to this frame (no SettingWithCopyWarning).
profiles_df = profiles_df[keep_profile].copy()
print(profiles_df.shape)

# Apply the same restriction to posts_df: keep only posts whose alias survived the filter
posts_df = posts_df[posts_df['alias'].isin(profiles_df['alias'])].copy()
print(posts_df.shape)


(747, 7)
(12697, 10)


## Preprocessing profiles_df

In [14]:
# Derive a coarse category from a profile's website
def categorize_website(website):
    """Return the category label for a profile's website.

    Parameters
    ----------
    website : str or missing value
        The website entry of a profile.

    Returns
    -------
    str
        'None' when ``website`` is missing (``pd.isnull``), otherwise the label
        of the first keyword rule that matches the lower-cased website, or
        'Other' when no rule matches.
    """
    if pd.isnull(website):
        return 'None'

    lowered = website.lower()

    # Rules are tried top to bottom; the first one with a matching keyword wins
    rules = (
        ('Youtube', ('youtube',)),
        ('Facebook', ('facebook',)),
        ('Twitter', ('twitter',)),
        ('Blog', ('blog',)),
        ('Music', ('music', 'spotify')),
    )
    for label, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return label

    return 'Other'

# Label every profile with the category derived from its website
website_categories = profiles_df['website'].map(categorize_website)
profiles_df['website_category'] = website_categories

# Number of profiles per category
profiles_df['website_category'].value_counts()

Other       576
None         87
Facebook     33
Youtube      28
Blog         19
Twitter       3
Music         1
Name: website_category, dtype: int64

## Preprocessing posts_df

In [12]:
# Parse the date column once, then store both the parsed dates and
# the full weekday name (strftime '%A') of every post
parsed_dates = pd.to_datetime(posts_df['date'])
posts_df['date'] = parsed_dates
posts_df['weekday'] = parsed_dates.dt.strftime('%A')

In [26]:
# Bucket the posts into ten quantile groups of numberLikes:
#   category 10 = the 10% of posts with the most likes,
#   category 1  = the 10% of posts with the fewest likes.

# Order the rows from most to least liked
posts_df = posts_df.sort_values(by='numberLikes', ascending=False)

# Zero-based quantile bin of every post; duplicate bin edges are dropped instead of raising
quantiles = pd.qcut(posts_df['numberLikes'], q=10, labels=False, duplicates='drop')

# Shift the bins so that the category numbers start at 1 instead of 0
posts_df['numberLikesCategory'] = quantiles.add(1)

# Number of posts per category
posts_df['numberLikesCategory'].value_counts()


1     1272
9     1271
5     1271
6     1270
4     1270
10    1269
8     1269
3     1269
7     1268
2     1268
Name: numberLikesCategory, dtype: int64

## NLP

In [41]:
import nltk
import re
import string
import emoji
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anwender\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [48]:
def remove_punctuation(text):
    """Remove every ASCII punctuation character from ``text``.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without any character contained in ``string.punctuation``.
    """
    # re.escape is required here: spliced in unescaped, the "\]" inside string.punctuation
    # is read as an escaped "]", so the backslash drops out of the character class and
    # backslashes in the text would survive.
    no_punct = re.sub('[' + re.escape(string.punctuation) + ']', '', text)
    return no_punct

# Stop-word sets already read from the NLTK corpus, keyed by language name
_STOPWORD_SETS = {}


def remove_stopwords(text, language='english'):
    """Drop stop words from ``text`` using the NLTK stop-word corpus.

    Parameters
    ----------
    text : str
        The text to clean; it is split on whitespace.
    language : str, default 'english'
        Name of the NLTK stop-word list to use.

    Returns
    -------
    str
        The remaining words joined by single spaces. Words are compared
        case-insensitively but returned in their original casing.
    """
    stop_words = _STOPWORD_SETS.get(language)
    if stop_words is None:
        # Read the corpus only on first use; doing it on every call means
        # one corpus read per row when the function is used with .apply().
        stop_words = _STOPWORD_SETS[language] = frozenset(stopwords.words(language))

    no_stopwords = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    return no_stopwords

def remove_emojis(text):
    """Replace every emoji in ``text`` with a single space.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with each emoji replaced by a space. Literal ``:shortcode:``
        fragments made of lower-case letters, ``_`` and ``-`` are replaced too.
    """
    # Emojis are first rewritten to their textual name, wrapped in two control characters
    # instead of the default ":" delimiters. The whole wrapped span can then be removed
    # whatever characters the name contains; a pattern limited to [a-z_-] would leave
    # names with digits or capitals (e.g. "1st_place_medal", "Germany") in the text.
    start, end = '\x01', '\x02'
    tagged = emoji.demojize(text, delimiters=(start, end))
    no_emojis = re.sub(start + '[^' + end + ']*' + end, ' ', tagged)

    # Also strip shortcodes that were already spelled out as text in the input
    no_emojis = re.sub('(:[a-z_-]+:)', ' ', no_emojis)
    return no_emojis

# Clean every description in three passes: punctuation, then stop words, then emojis
posts_df['descriptionProcessed'] = (
    posts_df['description']
    .apply(remove_punctuation)
    .apply(remove_stopwords)
    .apply(remove_emojis)
)


In [63]:
# Bag-of-words features for the cleaned descriptions.
# By default the vectorizer converts the text to lower case and uses word-level tokenization.
# max_features is set to 500 (this is what they did in the tds implementation).
vec = CountVectorizer(max_features=500)

# Sparse document-term matrix of token counts, one row per post
description_counts = vec.fit_transform(posts_df['descriptionProcessed'])

# Dense copy of the counts
description_counts_array = description_counts.toarray()

# One column per vocabulary term.
# NOTE(review): df gets a fresh 0..n-1 index while posts_df keeps its own (re-ordered) index;
# align the two before joining them column-wise - confirm against the downstream cells.
df = pd.DataFrame(description_counts_array, columns=vec.get_feature_names_out())

# Both frames should have the same number of rows
print(df.shape, posts_df.shape, sep='\n')


(12697, 500)
(12697, 13)


## Language detection

In [60]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Parameters
    ----------
    text : str
        The text to classify.

    Returns
    -------
    str
        The detected language code, or "Unknown" when langdetect raises
        ``LangDetectException``.
    """
    # langdetect's algorithm is randomised; pinning the factory seed makes the labels
    # repeatable across runs. Imported locally so the function is self-contained.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException:
        return "Unknown"

# Detect the language of every raw (unprocessed) post description
detected_languages = posts_df['description'].map(detect_language)
posts_df['language'] = detected_languages

# Number of posts per detected language
posts_df['language'].value_counts()

en         9277
es          479
Unknown     406
pt          321
ru          248
fr          208
it          199
de          147
tr          130
ar          121
id          117
no          107
af          102
ca           80
et           72
so           71
tl           61
nl           60
cy           56
fi           49
ro           43
sv           41
pl           40
da           35
th           30
vi           26
fa           25
ko           20
ja           18
sw           18
sl           13
bg           11
sk           11
mk           10
hr            9
sq            8
hu            8
zh-tw         5
lt            5
lv            5
cs            2
uk            2
zh-cn         1
Name: language, dtype: int64