In [178]:
import os
import pandas as pd
import json

## generate profiles data frame

In [179]:

# Define the folder path
folder_path = "profiles"

# Initialize empty lists to store the extracted data
alias_list = []
number_posts_list = []
number_followers_list = []
number_following_list = []
website_list = []

# Iterate over the JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        # Read the JSON file with UTF-8 encoding
        with open(os.path.join(folder_path, filename), encoding='utf-8') as file:
            data = json.load(file)
        
        # Extract the required attributes
        alias = data.get("alias")
        number_posts = data.get("numberPosts")
        number_followers = data.get("numberFollowers")
        number_following = data.get("numberFollowing")
        website = data.get("website")
        
        # Append the data to the respective lists
        alias_list.append(alias)
        number_posts_list.append(number_posts)
        number_followers_list.append(number_followers)
        number_following_list.append(number_following)
        website_list.append(website)

# Create the data frame
data = {
    "alias": alias_list,
    "numberPosts": number_posts_list,
    "numberFollowers": number_followers_list,
    "numberFollowing": number_following_list,
    "website": website_list
}
profiles_df = pd.DataFrame(data)

# Print the data frame
print(profiles_df)


                  alias  numberPosts  numberFollowers  numberFollowing  \
0            1misssmeis          988           720979              233   
1                3ala2o          938           792886              466   
2                   433         6009         14545102              433   
3        6senseofficial         3324           243094                0   
4               7ikhals         1444           219458              221   
..                  ...          ...              ...              ...   
973             _ingo_1          422           149566              127   
974  _mariannejacobsen_         1593           189279              290   
975             _picolo          776           927457              566   
976          _tinamaria          821           160393              730   
977              _tuck4         1623           139150              246   

                                               website  
0                                    www.sylviemeis.de

## Generate posts data frame

In [180]:
# Define the folder path
folder_path = "profiles"

# Initialize empty lists to store the extracted data
alias_list = []
url_image_list = []
is_video_list = []
multiple_image_list = []
tags_list = []
mentions_list = []
description_list = []
date_list = []
number_likes_list = []

# Iterate over the JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        # Read the JSON file with UTF-8 encoding
        with open(os.path.join(folder_path, filename), encoding='utf-8') as file:
            data = json.load(file)
        
        # Extract the alias
        alias = data.get("alias")
        
        # Extract the posts
        posts = data.get("posts", [])
        for post in posts:
            url_image = post.get("urlImage")
            is_video = post.get("isVideo")
            multiple_image = post.get("multipleImage")
            tags = post.get("tags")
            mentions = post.get("mentions")
            description = post.get("description")
            date = post.get("date")
            number_likes = post.get("numberLikes")
            
            # Append the data to the respective lists
            alias_list.append(alias)
            url_image_list.append(url_image)
            is_video_list.append(is_video)
            multiple_image_list.append(multiple_image)
            tags_list.append(tags)
            mentions_list.append(mentions)
            description_list.append(description)
            date_list.append(date)
            number_likes_list.append(number_likes)

# Create the data frame
data = {
    "alias": alias_list,
    "urlImage": url_image_list,
    "isVideo": is_video_list,
    "multipleImage": multiple_image_list,
    "tags": tags_list,
    "mentions": mentions_list,
    "description": description_list,
    "date": date_list,
    "numberLikes": number_likes_list
}
posts_df = pd.DataFrame(data)

# Print the data frame
print(posts_df)


            alias                                           urlImage  isVideo  \
0      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
1      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
2      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
3      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
4      1misssmeis  https://scontent.cdninstagram.com/t51.2885-15/...    False   
...           ...                                                ...      ...   
16534      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16535      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16536      _tuck4  [https://scontent.cdninstagram.com/t51.2885-15...    False   
16537      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   
16538      _tuck4  https://scontent.cdninstagram.com/t51.2885-15/...    False   

       multipleImage       

## Average number of likes per post

In [181]:

# Group posts_df by alias and calculate the average numberLikes
average_likes = posts_df.groupby('alias')['numberLikes'].mean()

# Add the average_likes column to profiles_df
profiles_df['average_likes'] = profiles_df['alias'].map(average_likes)

# Display the updated profiles_df
print(profiles_df)


                  alias  numberPosts  numberFollowers  numberFollowing  \
0            1misssmeis          988           720979              233   
1                3ala2o          938           792886              466   
2                   433         6009         14545102              433   
3        6senseofficial         3324           243094                0   
4               7ikhals         1444           219458              221   
..                  ...          ...              ...              ...   
973             _ingo_1          422           149566              127   
974  _mariannejacobsen_         1593           189279              290   
975             _picolo          776           927457              566   
976          _tinamaria          821           160393              730   
977              _tuck4         1623           139150              246   

                                               website  average_likes  
0                                    ww

## Filtering

In [182]:
#filtering, such that
#numberFollowers < 1.000.000
#average_likes  < 200.000
profiles_df = profiles_df[(profiles_df['numberFollowers'] < 1000000) & (profiles_df['average_likes'] < 200000)]
print(profiles_df.shape)

#apply filters also to posts_df
posts_df = posts_df[posts_df['alias'].isin(profiles_df['alias'])]
print(posts_df.shape)


(747, 6)
(12697, 9)


## Preprocessing profiles_df

In [183]:
#assigning categories based on the website
def website_available(website):
    if pd.isnull(website):
        return 0
    else:
        return 1

profiles_df['website_available'] = profiles_df['website'].apply(website_available)

profiles_df['website_available'].value_counts()

1    660
0     87
Name: website_available, dtype: int64

## Preprocessing posts_df

### Weekday

In [184]:
#Storing day of the week
posts_df['date'] = pd.to_datetime(posts_df['date'])
posts_df['weekday'] = posts_df['date'].dt.strftime('%A')

### Likes categorization

In [185]:
#Categorizing into 10 equally sized groups based on numberLikes
#Category 10 are the 10% of posts with the highest likes
#Category 1 are the 10% with the lowest likes

# Sort the DataFrame by numberLikes in descending order
posts_df = posts_df.sort_values('numberLikes', ascending=False)

# Calculate the quantiles for the groups
quantiles = pd.qcut(posts_df['numberLikes'], q=10, labels=False, duplicates='drop')

# Assign the group numbers to the numberLikesCategory column
posts_df['numberLikesCategory'] = quantiles + 1  # Add 1 to make the group numbers start from 1 instead of 0

posts_df['numberLikesCategory'].value_counts()
#sorted_df.shape


1     1272
9     1271
5     1271
6     1270
4     1270
10    1269
8     1269
3     1269
7     1268
2     1268
Name: numberLikesCategory, dtype: int64

### Number of relevant hashtags

In [186]:
import re
hashtags = []

#reading hashtags from textfile, adding them to a list of strings
with open('top_500_hashtags.txt', 'r', encoding='utf-8') as file:
    for line in file:
        hashtags.append(line.strip())

cleaned_hashtags = []

#cleaning the strings, i.e. remove index infront of the hashtag an popularity after it
for hashtag in hashtags:
    cleaned_hashtag = re.sub(r'[0-9.]', '', hashtag)[:-1]
    cleaned_hashtags.append(cleaned_hashtag)

print(cleaned_hashtags)

['#love', '#instagood', '#instagram', '#fashion', '#photooftheday', '#beautiful', '#art', '#photography', '#happy', '#picoftheday', '#cute', '#follow', '#tbt', '#followme', '#nature', '#likelike', '#travel', '#style', '#repost', '#summer', '#instadaily', '#selfie', '#me', '#music', '#friends', '#fitness', '#girl', '#food', '#fun', '#beauty', '#instalike', '#smile', '#family', '#photo', '#life', '#likeforlike', '#ootd', '#followfollow', '#makeup', '#amazing', '#igers', '#nofilter', '#dog', '#model', '#sunset', '#beach', '#instamood', '#foodporn', '#motivation', '#followforfollow', '#design', '#lifestyle', '#sky', '#ll', '#ff', '#일상', '#cat', '#handmade', '#hair', '#nails', '#vscocam', '#bestoftheday', '#vsco', '#funny', '#dogsofinstagram', '#drawing', '#artist', '#gym', '#flowers', '#baby', '#wedding', '#girls', '#instapic', '#pretty', '#likeforlikes', '#photographer', '#instafood', '#party', '#inspiration', '#lol', '#cool', '#workout', '#likeforfollow', '#swag', '#fit', '#healthy', '#y

In [187]:
#amount of hastags over all
posts_df['amount_tags'] = posts_df['tags'].apply(lambda x: len(x))

# Function to count relevant tags
def count_relevant_tags(tags, cleaned_hashtags):
    return sum(tag in cleaned_hashtags for tag in tags)

# Apply the function to each row and store the result in a new column
posts_df['amount_relevant_tags'] = posts_df.apply(lambda row: count_relevant_tags(row['tags'], cleaned_hashtags), axis=1)

posts_df['amount_relevant_tags'].value_counts()

0     11006
1       916
2       281
3       200
4       127
5        65
8        25
6        20
7        16
10        9
11        9
9         8
12        4
15        3
13        3
14        2
17        1
16        1
19        1
Name: amount_relevant_tags, dtype: int64

### Calculating means

In [188]:
#Returns rows with same alias and smaller date than date in parameter
def filter_dataframe_by_alias_and_date(df, alias, date):
    # Convert the date parameter to a Timestamp object
    date = pd.Timestamp(date)

    # Filter the DataFrame based on the given conditions
    filtered_df = df[(df['alias'] == alias) & (df['date'] < date)]

    # Return the filtered DataFrame
    return filtered_df.sort_values('date', ascending=False)

def moving_average(df, row):
    filtered_df = filter_dataframe_by_alias_and_date(df, row['alias'], row['date'])
    
    if len(filtered_df) < 5:
        return -1
    else:
        return filtered_df.head(5)['numberLikes'].mean()

    
posts_df['moving_avg'] = posts_df.apply(lambda row: moving_average(posts_df, row), axis=1)



## NLP

In [189]:
import nltk
import re
import string
import emoji
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86178\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [190]:
def remove_punctuation(text):
    # Remove punctuation using regular expressions
    no_punct = re.sub('[' + string.punctuation + string.digits + ']', '', text)
    return no_punct

def remove_stopwords(text):
    # Remove stopwords using NLTK corpus
    stop_words = set(stopwords.words('english'))
    no_stopwords = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return no_stopwords

def remove_emojis(text):
    # Convert emojis to textual representation and remove them
    no_emojis = emoji.demojize(text)
    no_emojis = re.sub('(:[a-z_-]+:)', ' ', no_emojis, flags=re.IGNORECASE)
    return no_emojis
posts_df['descriptionProcessed'] = posts_df['description'].apply(remove_punctuation)
posts_df['descriptionProcessed'] = posts_df['descriptionProcessed'].apply(remove_stopwords)
posts_df['descriptionProcessed'] = posts_df['descriptionProcessed'].apply(remove_emojis)


In [191]:

def load_embeddings(file="wiki-news-300d-1M.vec"):
    embeddings = {}
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 100000:
                break
            tokens = line.rstrip().split(' ')
            embeddings[tokens[0]] = np.asarray(tokens[1:], dtype='float64')

    return embeddings


def tokenize(sentence):
    tokens = nltk.word_tokenize(sentence)
    return tokens


def embed_sentence(sentence, word2emb):
    tokens = tokenize(sentence)
    token_embeddings = []
    for token in tokens:
        if token in word2emb:
            token_embeddings.append(word2emb[token])
        else:
            token_embeddings.append(np.zeros(300))
    if len(token_embeddings) > 0:
        sentence_embedding = np.mean(token_embeddings, axis=0)
    else:
        sentence_embedding = np.zeros(300)
    return sentence_embedding



In [192]:

embeddings = load_embeddings()
posts_df['descriptionVector'] = None
posts_df['descriptionVector'] = \
    [embed_sentence(sentence, embeddings) for sentence in posts_df['descriptionProcessed']]

In [193]:
#by default the vectorizer conerts the text to lower case and uses word-level tokenization
# Create an instance of CountVectorizer with max_features set to 500 (this is what they did in the tds implementation)
vec = CountVectorizer(max_features=500)


# Transform the "descriptionProcessed" column into a matrix of token counts
description_counts = vec.fit_transform(posts_df['descriptionProcessed'])

# Convert the matrix to an array
description_counts_array = description_counts.toarray()

df = pd.DataFrame(data=description_counts_array,columns = vec.get_feature_names_out())
print(df.shape)
print(posts_df.shape)


(12697, 500)
(12697, 16)


## Language detection

In [194]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "Unknown"

# Assuming 'description' is the column name in your DataFrame
posts_df['language'] = posts_df['description'].apply(detect_language)
posts_df = posts_df[posts_df['language'] != 'Unknown']


posts_df['language'].value_counts()
posts_df

Unnamed: 0,alias,urlImage,isVideo,multipleImage,tags,mentions,description,date,numberLikes,weekday,numberLikesCategory,amount_tags,amount_relevant_tags,moving_avg,descriptionProcessed,descriptionVector,language
4472,elisabeth.rioux,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,[],"[@jonathan_germain, @elisabeth.rioux, @hoaka_s...","WE'RE OFFICIALLY PARENTS ❤️❤️ Yes, Jonathan ca...",2017-04-29 05:00:00+00:00,180164,Saturday,10,0,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,"[-0.015670370370370366, 0.01253703703703704, -...",en
16495,_picolo,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,"[#beautyandthebeast, #leuchtturm1917]","[@faber_castell_br, @leuchtturm1917]",Going to watch Beauty and the Beast tonight! 🥀...,2017-03-24 05:00:00+00:00,134824,Friday,10,2,0,70003.8,Going watch Beauty Beast tonight Hoping wor...,"[-0.07210869565217391, -0.025652173913043478, ...",en
4482,elisabeth.rioux,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,[],"[@fashionnova, @wrstbhvr, @jonathan_germain, @...",In 2 days I'll be on my way to see the love of...,2017-04-05 05:00:00+00:00,132790,Wednesday,10,0,0,111266.4,days Ill way see love life India hes reason ha...,"[-0.04034375, -0.010975, -0.04198125, -0.05997...",en
4480,elisabeth.rioux,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,[],"[@runwaydreamz, @jonathan_germain, @lackofcolo...",40 degrees celcius in Mumbai and I have to wea...,2017-04-10 05:00:00+00:00,129201,Monday,10,0,0,116196.8,degrees celcius Mumbai wear pants difficult ...,"[-0.013634782608695651, 0.00902173913043478, -...",en
4475,elisabeth.rioux,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,[],"[@hoaka_swimwear, @arubatourism, @hoaka_swimwear]","Missing this so much, which island do you thin...",2017-04-24 05:00:00+00:00,128430,Monday,10,0,0,121036.4,Missing much island think beautiful Caribbean ...,"[0.0012222222222222218, 0.0050444444444444425,...",en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15011,the_fabcloset,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,"[#spring, #splendid_flowers, #lookdujour, #fas...",[],Sunday stop and smell the flowers. Unless you ...,2017-04-30 05:00:00+00:00,36,Sunday,1,6,1,122.0,Sunday stop smell flowers Unless allergies sto...,"[-0.04809411764705882, 0.010547058823529413, -...",en
15266,trainforfitspo,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,[],"[@gmail, @celinefrazier, @celinefrazier, @gmail]",💕 Email me at 💌celinefrazierfitness@gmail.com ...,2017-04-30 05:00:00+00:00,29,Sunday,1,0,0,1640.4,Email celinefrazierfitnessgmailcom head pag...,"[-0.007543333333333338, 0.011746666666666666, ...",en
10166,mensflair,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,"[#suit, #suits, #gentlemen, #gentlemens, #fash...",[@the_vasco],Courtesy of @the_vasco \n_____________________...,2017-05-01 05:00:00+00:00,17,Monday,1,29,8,528.4,Courtesy thevasco suit suits gentlemen gentlem...,"[-0.01659, -0.012190000000000001, -0.010776666...",en
3249,colerise,https://scontent.cdninstagram.com/t51.2885-15/...,False,False,[#feelthevibejamaica],[@visitjamaica],end of the road at the most western point of j...,1970-01-01 00:00:00+00:00,0,Thursday,1,1,0,-1.0,end road western point jamaica taken rum hand ...,"[0.0178, -0.002629999999999998, -0.03778, -0.0...",en


## Merging

In [195]:
columns_to_keep = ['alias', 'numberPosts', 'numberFollowers', 'numberFollowing', 'website_available']
profiles_df = profiles_df[columns_to_keep]
profiles_df.columns

Index(['alias', 'numberPosts', 'numberFollowers', 'numberFollowing',
       'website_available'],
      dtype='object')

In [196]:
columns_to_keep = ['alias', 'multipleImage', 'weekday', 'numberLikesCategory', 'amount_relevant_tags', 'moving_avg', 'descriptionProcessed','descriptionVector', 'language']
posts_df = posts_df[columns_to_keep]
posts_df

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,descriptionVector,language
4472,elisabeth.rioux,False,Saturday,10,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,"[-0.015670370370370366, 0.01253703703703704, -...",en
16495,_picolo,False,Friday,10,0,70003.8,Going watch Beauty Beast tonight Hoping wor...,"[-0.07210869565217391, -0.025652173913043478, ...",en
4482,elisabeth.rioux,False,Wednesday,10,0,111266.4,days Ill way see love life India hes reason ha...,"[-0.04034375, -0.010975, -0.04198125, -0.05997...",en
4480,elisabeth.rioux,False,Monday,10,0,116196.8,degrees celcius Mumbai wear pants difficult ...,"[-0.013634782608695651, 0.00902173913043478, -...",en
4475,elisabeth.rioux,False,Monday,10,0,121036.4,Missing much island think beautiful Caribbean ...,"[0.0012222222222222218, 0.0050444444444444425,...",en
...,...,...,...,...,...,...,...,...,...
15011,the_fabcloset,False,Sunday,1,1,122.0,Sunday stop smell flowers Unless allergies sto...,"[-0.04809411764705882, 0.010547058823529413, -...",en
15266,trainforfitspo,False,Sunday,1,0,1640.4,Email celinefrazierfitnessgmailcom head pag...,"[-0.007543333333333338, 0.011746666666666666, ...",en
10166,mensflair,False,Monday,1,8,528.4,Courtesy thevasco suit suits gentlemen gentlem...,"[-0.01659, -0.012190000000000001, -0.010776666...",en
3249,colerise,False,Thursday,1,0,-1.0,end road western point jamaica taken rum hand ...,"[0.0178, -0.002629999999999998, -0.03778, -0.0...",en


In [197]:
merged_df = posts_df.merge(profiles_df, on='alias', how='inner')
merged_df = merged_df[merged_df['moving_avg'] != -1]
#merged_df.to_csv('merged_data.csv', index=True, sep=';')
merged_df

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,descriptionVector,language,numberPosts,numberFollowers,numberFollowing,website_available
0,elisabeth.rioux,False,Saturday,10,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,"[-0.015670370370370366, 0.01253703703703704, -...",en,554,990729,266,1
1,elisabeth.rioux,False,Wednesday,10,0,111266.4,days Ill way see love life India hes reason ha...,"[-0.04034375, -0.010975, -0.04198125, -0.05997...",en,554,990729,266,1
2,elisabeth.rioux,False,Monday,10,0,116196.8,degrees celcius Mumbai wear pants difficult ...,"[-0.013634782608695651, 0.00902173913043478, -...",en,554,990729,266,1
3,elisabeth.rioux,False,Monday,10,0,121036.4,Missing much island think beautiful Caribbean ...,"[0.0012222222222222218, 0.0050444444444444425,...",en,554,990729,266,1
4,elisabeth.rioux,False,Tuesday,10,0,123242.8,Small nose small cheeks big lips big eyes flee...,"[-0.020484545454545457, -0.03314818181818183, ...",en,554,990729,266,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12281,wristtakers,False,Saturday,1,0,117.6,New beautiful colors Check silicon beaded brac...,"[0.102875, 0.012449999999999998, 0.01742499999...",en,15,148876,2085,1
12284,wristtakers,False,Friday,1,0,103.6,new bracelets going website tomorrow sure chec...,"[0.013516666666666665, 0.011391666666666663, -...",en,15,148876,2085,1
12285,wristtakers,False,Saturday,1,0,117.6,Dress AppleWatch wristtakers silicon beaded br...,"[0.13026666666666667, 0.026099999999999995, 0....",en,15,148876,2085,1
12286,wristtakers,False,Friday,1,0,103.6,stock Equality rainbow silicon beaded bracelet...,"[0.10581250000000002, -0.009400000000000002, -...",en,15,148876,2085,1


In [198]:
#by default the vectorizer conerts the text to lower case and uses word-level tokenization
# Create an instance of CountVectorizer with max_features set to 500 (this is what they did in the tds implementation)
vec = CountVectorizer(max_features=500)


# Transform the "descriptionProcessed" column into a matrix of token counts
description_counts = vec.fit_transform(merged_df['descriptionProcessed'])

# Convert the matrix to an array
description_counts_array = description_counts.toarray()

word_vectors = pd.DataFrame(data=description_counts_array,columns = vec.get_feature_names_out())
print(word_vectors.shape)


(8356, 500)


In [199]:
# final_df = pd.concat([merged_df.reset_index(drop=True), word_vectors.reset_index(drop=True)], axis=1)
final_df = merged_df.drop("language", axis=1)
final_df.to_csv('final_data with both English and nonEnglish.csv', index=True, sep=';')

In [200]:
final_df

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,descriptionVector,numberPosts,numberFollowers,numberFollowing,website_available
0,elisabeth.rioux,False,Saturday,10,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,"[-0.015670370370370366, 0.01253703703703704, -...",554,990729,266,1
1,elisabeth.rioux,False,Wednesday,10,0,111266.4,days Ill way see love life India hes reason ha...,"[-0.04034375, -0.010975, -0.04198125, -0.05997...",554,990729,266,1
2,elisabeth.rioux,False,Monday,10,0,116196.8,degrees celcius Mumbai wear pants difficult ...,"[-0.013634782608695651, 0.00902173913043478, -...",554,990729,266,1
3,elisabeth.rioux,False,Monday,10,0,121036.4,Missing much island think beautiful Caribbean ...,"[0.0012222222222222218, 0.0050444444444444425,...",554,990729,266,1
4,elisabeth.rioux,False,Tuesday,10,0,123242.8,Small nose small cheeks big lips big eyes flee...,"[-0.020484545454545457, -0.03314818181818183, ...",554,990729,266,1
...,...,...,...,...,...,...,...,...,...,...,...,...
12281,wristtakers,False,Saturday,1,0,117.6,New beautiful colors Check silicon beaded brac...,"[0.102875, 0.012449999999999998, 0.01742499999...",15,148876,2085,1
12284,wristtakers,False,Friday,1,0,103.6,new bracelets going website tomorrow sure chec...,"[0.013516666666666665, 0.011391666666666663, -...",15,148876,2085,1
12285,wristtakers,False,Saturday,1,0,117.6,Dress AppleWatch wristtakers silicon beaded br...,"[0.13026666666666667, 0.026099999999999995, 0....",15,148876,2085,1
12286,wristtakers,False,Friday,1,0,103.6,stock Equality rainbow silicon beaded bracelet...,"[0.10581250000000002, -0.009400000000000002, -...",15,148876,2085,1


In [201]:
final_df_onlyEnglish=merged_df[merged_df['language']=='en']
final_df_onlyEnglish = final_df_onlyEnglish.drop("language", axis=1)
final_df_onlyEnglish.to_csv('final_data with only english.csv', index=True, sep=';')

In [202]:
final_df_onlyEnglish

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,descriptionVector,numberPosts,numberFollowers,numberFollowing,website_available
0,elisabeth.rioux,False,Saturday,10,0,119771.0,OFFICIALLY PARENTS Yes Jonathan came live C...,"[-0.015670370370370366, 0.01253703703703704, -...",554,990729,266,1
1,elisabeth.rioux,False,Wednesday,10,0,111266.4,days Ill way see love life India hes reason ha...,"[-0.04034375, -0.010975, -0.04198125, -0.05997...",554,990729,266,1
2,elisabeth.rioux,False,Monday,10,0,116196.8,degrees celcius Mumbai wear pants difficult ...,"[-0.013634782608695651, 0.00902173913043478, -...",554,990729,266,1
3,elisabeth.rioux,False,Monday,10,0,121036.4,Missing much island think beautiful Caribbean ...,"[0.0012222222222222218, 0.0050444444444444425,...",554,990729,266,1
4,elisabeth.rioux,False,Tuesday,10,0,123242.8,Small nose small cheeks big lips big eyes flee...,"[-0.020484545454545457, -0.03314818181818183, ...",554,990729,266,1
...,...,...,...,...,...,...,...,...,...,...,...,...
12281,wristtakers,False,Saturday,1,0,117.6,New beautiful colors Check silicon beaded brac...,"[0.102875, 0.012449999999999998, 0.01742499999...",15,148876,2085,1
12284,wristtakers,False,Friday,1,0,103.6,new bracelets going website tomorrow sure chec...,"[0.013516666666666665, 0.011391666666666663, -...",15,148876,2085,1
12285,wristtakers,False,Saturday,1,0,117.6,Dress AppleWatch wristtakers silicon beaded br...,"[0.13026666666666667, 0.026099999999999995, 0....",15,148876,2085,1
12286,wristtakers,False,Friday,1,0,103.6,stock Equality rainbow silicon beaded bracelet...,"[0.10581250000000002, -0.009400000000000002, -...",15,148876,2085,1


In [203]:
final_df_without_English=merged_df[merged_df['language']!='en']
final_df_without_English = final_df_without_English.drop("language", axis=1)
final_df_without_English.to_csv('final_data without english.csv', index=True, sep=';')

In [204]:
final_df_without_English

Unnamed: 0,alias,multipleImage,weekday,numberLikesCategory,amount_relevant_tags,moving_avg,descriptionProcessed,descriptionVector,numberPosts,numberFollowers,numberFollowing,website_available
8,elisabeth.rioux,False,Thursday,10,0,114961.0,Mood,"[0.0253, 0.02, -0.1144, 0.0359, -0.022, 0.0643...",554,990729,266,1
40,cacatengker,False,Monday,10,0,42904.8,pretty bridal robe mandmatelier,"[0.05155, 0.03275, 0.067425, -0.113, -0.015999...",411,963746,748,1
44,cacatengker,False,Wednesday,10,0,34591.4,stage deoentertainment ebimoektidecor :...,"[0.004233333333333333, -0.017241666666666666, ...",411,963746,748,1
46,cacatengker,False,Wednesday,10,0,34591.4,hero Mba Kesayangan marlenehariman,"[-0.001775, -0.014325, 0.032075, -0.0013, -0.0...",411,963746,748,1
48,cacatengker,True,Sunday,10,0,71916.0,little party never killed nobody fromtengkerto...,"[0.028483333333333333, -0.023900000000000005, ...",411,963746,748,1
...,...,...,...,...,...,...,...,...,...,...,...,...
12246,tomer_gelb,False,Friday,1,1,343.4,bmw vorsteiner widebody,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",117,395357,2322,0
12247,tomer_gelb,False,Tuesday,1,1,420.6,ferrari california,"[0.004, 0.0398, 0.0, 0.0122, 0.0461, -0.05335,...",117,395357,2322,0
12251,tomer_gelb,False,Friday,1,0,343.4,audemarspiguet ap,"[-0.0678, -0.0226, 0.0039, -0.0909, 0.0601, 0....",117,395357,2322,0
12258,plriley,False,Sunday,1,0,180.8,Azul naranja,"[0.20175, -0.04705, -0.00035, 0.0882, -0.07795...",2476,144002,395,1


In [205]:
mask = final_df_without_English['descriptionVector'].apply(lambda x: np.all(np.array(x) == 0))

# Count the number of True values in the mask
num_all_zero_vectors = mask.sum()
num_all_zero_vectors

306

In [206]:
mask2 = final_df_onlyEnglish['descriptionVector'].apply(lambda x: np.all(np.array(x) == 0))

# Count the number of True values in the mask
num_all_zero_vectors2 = mask2.sum()
num_all_zero_vectors2

97