In [94]:
import os
import pandas as pd
import json

## generate profiles data frame

In [95]:

# Define the folder path
folder_path = "profiles"

# Initialize empty lists to store the extracted data
alias_list = []
number_posts_list = []
number_followers_list = []
number_following_list = []
website_list = []

# Iterate over the JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        # Read the JSON file with UTF-8 encoding
        with open(os.path.join(folder_path, filename), encoding='utf-8') as file:
            data = json.load(file)
        
        # Extract the required attributes
        alias = data.get("alias")
        number_posts = data.get("numberPosts")
        number_followers = data.get("numberFollowers")
        number_following = data.get("numberFollowing")
        website = data.get("website")
        
        # Append the data to the respective lists
        alias_list.append(alias)
        number_posts_list.append(number_posts)
        number_followers_list.append(number_followers)
        number_following_list.append(number_following)
        website_list.append(website)

# Create the data frame
data = {
    "alias": alias_list,
    "numberPosts": number_posts_list,
    "numberFollowers": number_followers_list,
    "numberFollowing": number_following_list,
    "website": website_list
}
profiles_df = pd.DataFrame(data)

# Print the data frame
print(profiles_df)


                  alias  numberPosts  numberFollowers  numberFollowing  \
0            1misssmeis          988           720979              233   
1                3ala2o          938           792886              466   
2                   433         6009         14545102              433   
3        6senseofficial         3324           243094                0   
4               7ikhals         1444           219458              221   
..                  ...          ...              ...              ...   
973             _ingo_1          422           149566              127   
974  _mariannejacobsen_         1593           189279              290   
975             _picolo          776           927457              566   
976          _tinamaria          821           160393              730   
977              _tuck4         1623           139150              246   

                                               website  
0                                    www.sylviemeis.de

## Generate posts data frame

In [96]:
# Define the folder path
folder_path = "profiles"

# Initialize empty lists to store the extracted data
alias_list = []
url_image_list = []
is_video_list = []
multiple_image_list = []
tags_list = []
mentions_list = []
description_list = []
date_list = []
number_likes_list = []

# Iterate over the JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        # Read the JSON file with UTF-8 encoding
        with open(os.path.join(folder_path, filename), encoding='utf-8') as file:
            data = json.load(file)
        
        # Extract the alias
        alias = data.get("alias")
        
        # Extract the posts
        posts = data.get("posts", [])
        for post in posts:
            url_image = post.get("urlImage")
            is_video = post.get("isVideo")
            multiple_image = post.get("multipleImage")
            tags = post.get("tags")
            mentions = post.get("mentions")
            description = post.get("description")
            date = post.get("date")
            number_likes = post.get("numberLikes")
            
            # Append the data to the respective lists
            alias_list.append(alias)
            url_image_list.append(url_image)
            is_video_list.append(is_video)
            multiple_image_list.append(multiple_image)
            tags_list.append(tags)
            mentions_list.append(mentions)
            description_list.append(description)
            date_list.append(date)
            number_likes_list.append(number_likes)

# Create the data frame
data = {
    "alias": alias_list,
    "urlImage": url_image_list,
    "isVideo": is_video_list,
    "multipleImage": multiple_image_list,
    "tags": tags_list,
    "mentions": mentions_list,
    "description": description_list,
    "date": date_list,
    "numberLikes": number_likes_list
}
posts_df = pd.DataFrame(data)

# Print the data frame
print(posts_df)


            alias                                           urlImage  isVideo  \
0      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
1      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
2      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
3      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
4      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
...           ...                                                ...      ...   
16534      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16535      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16536      _tuck4  [https://scontent.cdninstagram.com/t51.2885-15...    False   
16537      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16538      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   

       multipleImage       

## Average number of likes per post

In [97]:

# Group posts_df by alias and calculate the average numberLikes
average_likes = posts_df.groupby('alias')['numberLikes'].mean()

# Add the average_likes column to profiles_df
profiles_df['average_likes'] = profiles_df['alias'].map(average_likes)

# Display the updated profiles_df
print(profiles_df)


                  alias  numberPosts  numberFollowers  numberFollowing  \
0            1misssmeis          988           720979              233   
1                3ala2o          938           792886              466   
2                   433         6009         14545102              433   
3        6senseofficial         3324           243094                0   
4               7ikhals         1444           219458              221   
..                  ...          ...              ...              ...   
973             _ingo_1          422           149566              127   
974  _mariannejacobsen_         1593           189279              290   
975             _picolo          776           927457              566   
976          _tinamaria          821           160393              730   
977              _tuck4         1623           139150              246   

                                               website  average_likes  
0                                    ww

## Filtering

In [98]:
#filtering, such that
#numberFollowers < 1.000.000
#average_likes  < 200.000
profiles_df = profiles_df[(profiles_df['numberFollowers'] < 1000000) & (profiles_df['average_likes'] < 200000)]
print(profiles_df.shape)

#apply filters also to posts_df
posts_df = posts_df[posts_df['alias'].isin(profiles_df['alias'])]
print(posts_df.shape)


(747, 6)
(12697, 9)


## Preprocessing profiles_df

In [99]:
#assigning categories based on the website
def website_available(website):
    if pd.isnull(website):
        return 0
    else:
        return 1

profiles_df['website_available'] = profiles_df['website'].apply(website_available)

profiles_df['website_available'].value_counts()

1    660
0     87
Name: website_available, dtype: int64

## Preprocessing posts_df

### Weekday

In [100]:
#Storing day of the week
posts_df['date'] = pd.to_datetime(posts_df['date'])
posts_df['weekday'] = posts_df['date'].dt.strftime('%A')

### Likes categorization

In [101]:
#Categorizing into 10 equally sized groups based on numberLikes
#Category 10 are the 10% of posts with the highest likes
#Category 1 are the 10% with the lowest likes

# Sort the DataFrame by numberLikes in descending order
posts_df = posts_df.sort_values('numberLikes', ascending=False)

# Calculate the quantiles for the groups
quantiles = pd.qcut(posts_df['numberLikes'], q=10, labels=False, duplicates='drop')

# Assign the group numbers to the numberLikesCategory column
posts_df['numberLikesCategory'] = quantiles + 1  # Add 1 to make the group numbers start from 1 instead of 0

posts_df['numberLikesCategory'].value_counts()
#sorted_df.shape


1     1272
9     1271
5     1271
6     1270
4     1270
10    1269
8     1269
3     1269
7     1268
2     1268
Name: numberLikesCategory, dtype: int64

### Number of relevant hashtags

In [102]:
import re
hashtags = []

#reading hashtags from textfile, adding them to a list of strings
with open('top_500_hashtags.txt', 'r', encoding='utf-8') as file:
    for line in file:
        hashtags.append(line.strip())

cleaned_hashtags = []

#cleaning the strings, i.e. remove index infront of the hashtag an popularity after it
for hashtag in hashtags:
    cleaned_hashtag = re.sub(r'[0-9.]', '', hashtag)[:-1]
    cleaned_hashtags.append(cleaned_hashtag)

print(cleaned_hashtags)

['#love', '#instagood', '#instagram', '#fashion', '#photooftheday', '#beautiful', '#art', '#photography', '#happy', '#picoftheday', '#cute', '#follow', '#tbt', '#followme', '#nature', '#likelike', '#travel', '#style', '#repost', '#summer', '#instadaily', '#selfie', '#me', '#music', '#friends', '#fitness', '#girl', '#food', '#fun', '#beauty', '#instalike', '#smile', '#family', '#photo', '#life', '#likeforlike', '#ootd', '#followfollow', '#makeup', '#amazing', '#igers', '#nofilter', '#dog', '#model', '#sunset', '#beach', '#instamood', '#foodporn', '#motivation', '#followforfollow', '#design', '#lifestyle', '#sky', '#ll', '#ff', '#일상', '#cat', '#handmade', '#hair', '#nails', '#vscocam', '#bestoftheday', '#vsco', '#funny', '#dogsofinstagram', '#drawing', '#artist', '#gym', '#flowers', '#baby', '#wedding', '#girls', '#instapic', '#pretty', '#likeforlikes', '#photographer', '#instafood', '#party', '#inspiration', '#lol', '#cool', '#workout', '#likeforfollow', '#swag', '#fit', '#healthy', '#y

In [103]:
#amount of hastags over all
posts_df['amount_tags'] = posts_df['tags'].apply(lambda x: len(x))

# Function to count relevant tags
def count_relevant_tags(tags, cleaned_hashtags):
    return sum(tag in cleaned_hashtags for tag in tags)

# Apply the function to each row and store the result in a new column
posts_df['amount_relevant_tags'] = posts_df.apply(lambda row: count_relevant_tags(row['tags'], cleaned_hashtags), axis=1)

posts_df['amount_relevant_tags'].value_counts()

0     11006
1       916
2       281
3       200
4       127
5        65
8        25
6        20
7        16
10        9
11        9
9         8
12        4
15        3
13        3
14        2
17        1
16        1
19        1
Name: amount_relevant_tags, dtype: int64

### Calculating means

In [104]:
#Returns rows with same alias and smaller date than date in parameter
def filter_dataframe_by_alias_and_date(df, alias, date):
    # Convert the date parameter to a Timestamp object
    date = pd.Timestamp(date)

    # Filter the DataFrame based on the given conditions
    filtered_df = df[(df['alias'] == alias) & (df['date'] < date)]

    # Return the filtered DataFrame
    return filtered_df.sort_values('date', ascending=False)

def moving_average(df, row):
    filtered_df = filter_dataframe_by_alias_and_date(df, row['alias'], row['date'])
    
    if len(filtered_df) < 5:
        return -1
    else:
        return filtered_df.head(5)['numberLikes'].mean()

    
posts_df['moving_avg'] = posts_df.apply(lambda row: moving_average(posts_df, row), axis=1)



## NLP

In [105]:
import nltk
import re
import string
import emoji
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anwender\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [106]:
def remove_punctuation(text):
    # Remove punctuation using regular expressions
    no_punct = re.sub('[' + string.punctuation + string.digits + ']', '', text)
    return no_punct

def remove_stopwords(text):
    # Remove stopwords using NLTK corpus
    stop_words = set(stopwords.words('english'))
    no_stopwords = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return no_stopwords

def remove_emojis(text):
    # Convert emojis to textual representation and remove them
    no_emojis = emoji.demojize(text)
    no_emojis = re.sub('(:[a-z_-]+:)', ' ', no_emojis, flags=re.IGNORECASE)
    return no_emojis

posts_df['descriptionProcessed'] = posts_df['description'].apply(remove_punctuation)
posts_df['descriptionProcessed'] = posts_df['descriptionProcessed'].apply(remove_stopwords)
posts_df['descriptionProcessed'] = posts_df['descriptionProcessed'].apply(remove_emojis)


In [107]:
#by default the vectorizer conerts the text to lower case and uses word-level tokenization
# Create an instance of CountVectorizer with max_features set to 500 (this is what they did in the tds implementation)
vec = CountVectorizer(max_features=500)


# Transform the "descriptionProcessed" column into a matrix of token counts
description_counts = vec.fit_transform(posts_df['descriptionProcessed'])

# Convert the matrix to an array
description_counts_array = description_counts.toarray()

df = pd.DataFrame(data=description_counts_array,columns = vec.get_feature_names_out())
print(df.shape)
print(posts_df.shape)


(12697, 500)
(12697, 15)


## Language detection

In [108]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "Unknown"

# Assuming 'description' is the column name in your DataFrame
posts_df['language'] = posts_df['description'].apply(detect_language)


posts_df['language'].value_counts()

en         9276
es          476
Unknown     406
pt          324
ru          241
it          200
fr          195
de          148
tr          128
ar          125
id          120
no          108
af          103
ca           75
nl           71
et           70
so           69
tl           63
cy           61
fi           50
ro           47
sv           43
pl           42
da           34
th           29
vi           25
fa           22
ko           20
ja           19
sw           16
sl           11
bg           11
sk           11
hr           10
mk           10
hu            9
sq            8
lv            6
zh-tw         5
lt            4
cs            3
uk            2
zh-cn         1
Name: language, dtype: int64

## Merging

In [109]:
columns_to_keep = ['alias', 'numberPosts', 'numberFollowers', 'numberFollowing', 'website_available']
profiles_df = profiles_df[columns_to_keep]
profiles_df.columns

Index(['alias', 'numberPosts', 'numberFollowers', 'numberFollowing',
       'website_available'],
      dtype='object')

In [110]:
columns_to_keep = ['alias', 'multipleImage', 'weekday', 'numberLikesCategory', 'amount_relevant_tags', 'moving_avg', 'descriptionProcessed', 'language']
posts_df = posts_df[posts_df['language'] == 'en']
posts_df = posts_df[columns_to_keep]
posts_df

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,language
4472,elisabeth.rioux,False,Saturday,10,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,en
16495,_picolo,False,Friday,10,0,70003.8,Going watch Beauty Beast tonight Hoping wor...,en
4482,elisabeth.rioux,False,Wednesday,10,0,111266.4,days Ill way see love life India hes reason ha...,en
4480,elisabeth.rioux,False,Monday,10,0,116196.8,degrees celcius Mumbai wear pants difficult ...,en
4475,elisabeth.rioux,False,Monday,10,0,121036.4,Missing much island think beautiful Caribbean ...,en
...,...,...,...,...,...,...,...,...
15011,the_fabcloset,False,Sunday,1,1,122.0,Sunday stop smell flowers Unless allergies sto...,en
15266,trainforfitspo,False,Sunday,1,0,1640.4,Email celinefrazierfitnessgmailcom head pag...,en
10166,mensflair,False,Monday,1,8,528.4,Courtesy thevasco suit suits gentlemen gentlem...,en
3249,colerise,False,Thursday,1,0,-1.0,end road western point jamaica taken rum hand ...,en


In [111]:
merged_df = posts_df.merge(profiles_df, on='alias', how='inner')
merged_df = merged_df[merged_df['moving_avg'] != -1]
#merged_df.to_csv('merged_data.csv', index=True, sep=';')
merged_df

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,language,numberPosts,numberFollowers,numberFollowing,website_available
0,elisabeth.rioux,False,Saturday,10,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,en,554,990729,266,1
1,elisabeth.rioux,False,Wednesday,10,0,111266.4,days Ill way see love life India hes reason ha...,en,554,990729,266,1
2,elisabeth.rioux,False,Monday,10,0,116196.8,degrees celcius Mumbai wear pants difficult ...,en,554,990729,266,1
3,elisabeth.rioux,False,Monday,10,0,121036.4,Missing much island think beautiful Caribbean ...,en,554,990729,266,1
4,elisabeth.rioux,False,Tuesday,10,0,123242.8,Small nose small cheeks big lips big eyes flee...,en,554,990729,266,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9266,wristtakers,False,Saturday,1,0,117.6,New beautiful colors Check silicon beaded brac...,en,15,148876,2085,1
9269,wristtakers,False,Friday,1,0,103.6,new bracelets going website tomorrow sure chec...,en,15,148876,2085,1
9270,wristtakers,False,Saturday,1,0,117.6,Dress AppleWatch wristtakers silicon beaded br...,en,15,148876,2085,1
9271,wristtakers,False,Friday,1,0,103.6,stock Equality rainbow silicon beaded bracelet...,en,15,148876,2085,1


In [112]:
#by default the vectorizer conerts the text to lower case and uses word-level tokenization
# Create an instance of CountVectorizer with max_features set to 500 (this is what they did in the tds implementation)
vec = CountVectorizer(max_features=500)


# Transform the "descriptionProcessed" column into a matrix of token counts
description_counts = vec.fit_transform(merged_df['descriptionProcessed'])

# Convert the matrix to an array
description_counts_array = description_counts.toarray()

word_vectors = pd.DataFrame(data=description_counts_array,columns = vec.get_feature_names_out())
print(word_vectors.shape)


(6314, 500)


In [114]:
final_df = pd.concat([merged_df.reset_index(drop=True), word_vectors.reset_index(drop=True)], axis=1)
final_df = final_df.drop("language", axis=1)
final_df.to_csv('final_data.csv', index=True, sep=';')

In [115]:
final_df

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,numberPosts,numberFollowers,numberFollowing,...,year,years,yes,yesterday,yet,yoga,you,young,youre,youtube
0,elisabeth.rioux,False,Saturday,10,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,554,990729,266,...,0,0,1,0,0,0,0,0,0,1
1,elisabeth.rioux,False,Wednesday,10,0,111266.4,days Ill way see love life India hes reason ha...,554,990729,266,...,0,0,0,0,0,0,0,0,0,0
2,elisabeth.rioux,False,Monday,10,0,116196.8,degrees celcius Mumbai wear pants difficult ...,554,990729,266,...,0,0,0,0,0,0,0,0,0,0
3,elisabeth.rioux,False,Monday,10,0,121036.4,Missing much island think beautiful Caribbean ...,554,990729,266,...,0,0,0,0,0,0,0,0,0,0
4,elisabeth.rioux,False,Tuesday,10,0,123242.8,Small nose small cheeks big lips big eyes flee...,554,990729,266,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6309,wristtakers,False,Saturday,1,0,117.6,New beautiful colors Check silicon beaded brac...,15,148876,2085,...,0,0,0,0,0,0,0,0,0,0
6310,wristtakers,False,Friday,1,0,103.6,new bracelets going website tomorrow sure chec...,15,148876,2085,...,0,0,0,0,0,0,0,0,0,0
6311,wristtakers,False,Saturday,1,0,117.6,Dress AppleWatch wristtakers silicon beaded br...,15,148876,2085,...,0,0,0,0,0,0,0,0,0,0
6312,wristtakers,False,Friday,1,0,103.6,stock Equality rainbow silicon beaded bracelet...,15,148876,2085,...,0,0,0,0,0,0,0,0,0,0
