In [None]:
import numpy as np
import pandas as pd 
import warnings
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

In [None]:
#load the Instagram reach dataset (path is relative to the notebook's working directory)
csv_path = '../input/instagram-reach/instagram_reach.csv'
df = pd.read_csv(csv_path)

In [None]:
#preview the first five rows of the raw data
df.head(n=5)

In [None]:
#check info of data
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

There are missing values in the Caption column.

# Data Cleaning

## Remove unnecessary column

In [None]:
#only 6 columns are real features, so the "Unnamed: 0" and "S.No" columns are removed
redundant_columns = ['Unnamed: 0', 'S.No']
df = df.drop(columns=redundant_columns)

## Dealing with missing values

In [None]:
#drop every row that contains a missing value
#(the author notes that Caption is the only column with missing values)
df = df.dropna()

## Change data type & column name

In [None]:
#change column name
df = df.rename(columns={'USERNAME': 'Username'})

#change time since posted into hours
# Remove the literal 'hours' unit text and any surrounding whitespace, then
# parse what remains as a number (assumes values look like "11 hours" — TODO confirm).
hours_text = (
    df['Time since posted']
    .str.replace('hours', '', regex=False)
    .str.strip()
)
df['Time since posted (hours)'] = pd.to_numeric(hours_text)
df = df.drop(columns='Time since posted')

#change caption, username, hashtag into string
# astype() returns a new object, so the result has to be assigned back;
# the previous bare call discarded it and left the columns unchanged.
text_columns = ['Username', 'Caption', 'Hashtags']
df[text_columns] = df[text_columns].astype(str)

In [None]:
#preview the first five rows after cleaning
df.head(n=5)

# Exploratory Data Analysis

In [None]:
#Box plot of the follower counts
fig = px.box(
    data_frame=df,
    x='Followers',
    title='Distribution of followers',
)
fig.show()

Most of the accounts have fewer than 1.5k followers.

In [None]:
#Box plot of the like counts
fig = px.box(
    data_frame=df,
    x='Likes',
    title='Distribution of likes',
)
fig.show()

Most of the posts have fewer than 50 likes.

In [None]:
#Distribution of time since posted (hours)
# The title previously read 'Distribution of followers', copied from the
# followers plot, although this figure shows 'Time since posted (hours)'.
fig = px.box(df, x='Time since posted (hours)',
             title='Distribution of time since posted (hours)')
fig.show()

Most of the posts were posted less than 5 hours ago.

In [None]:
# Count the distinct usernames and report the total.
unique_user_count = df['Username'].nunique()
print(f"There are {unique_user_count} unique users")

In [None]:
#caption wordcloud
# Merge every caption into a single space-separated string.
caption_text = " ".join(df['Caption'])
caption_cloud = WordCloud(
    stopwords=set(STOPWORDS),
    width=1600,
    height=800,
).generate(caption_text)

# plt.style.use switches the active matplotlib style sheet to 'classic'.
plt.style.use('classic')
plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(caption_cloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

From the word cloud above, 'AI', 'will', and 'new' are the three most frequent words in the captions.

In [None]:
#hashtags wordcloud
# Merge every hashtag string into a single space-separated string.
hashtag_text = " ".join(df['Hashtags'])
hashtag_cloud = WordCloud(
    stopwords=set(STOPWORDS),
    width=1600,
    height=800,
).generate(hashtag_text)

# No plt.style.use call is made in this cell (the original left it commented out).
plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(hashtag_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

From the word cloud above, 'artificialintelligence', 'machinelearning', and 'AI' are the three most frequent words in the hashtags.

## Relationship between two variables

In [None]:
# Scatter plot of likes against the hours elapsed since posting.
fig = px.scatter(
    data_frame=df,
    x='Time since posted (hours)',
    y='Likes',
    title='Likes vs Time since posted',
)
fig.show()

Some of the data suggests a linear relationship between likes and time since posted, although most of the data shows no clear relationship.

In [None]:
# Scatter plot of likes against follower count.
# Title typo fixed: 'Followes' -> 'Followers'.
fig = px.scatter(df, x='Followers', y='Likes',
                 title='Likes vs Followers')
fig.show()

There appears to be a linear relationship between these two variables, but its direction is not consistent: it is positive for some of the data and negative for the rest.

In [None]:
# Heatmap of the pairwise correlations (DataFrame.corr() with its default
# settings) between the three numeric columns; text_auto=True asks plotly
# to annotate the cells with their values.
numeric_columns = ['Followers', 'Likes', 'Time since posted (hours)']
correlation_matrix = df[numeric_columns].corr()
fig = px.imshow(correlation_matrix, text_auto=True)
fig.show()

- There is a weak positive correlation between likes and followers
- There is a weak positive correlation between likes and time since posted
- There is a strong positive relationship between time since posted and followers

## Modelling

In [None]:
#Standardizing
# TODO: placeholder only — this cell contains no code, so no standardization is applied.

Predicting reach with XGBRegressor

In [None]:
# NOTE: every statement in this cell is commented out, so x, y and the
# train/test split are not created when the notebook runs. The later
# modelling cells (also commented out) refer to these names.
#Define x variables and y variable
#x = df[['Followers', 'Time since posted (hours)']]
#y = df['Likes']

#Split train set and test set (80% train / 20% test, fixed seed)
#X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# NOTE: every statement in this cell is commented out and does not run.
#Fit the model on the training set
#model = XGBRegressor()
#model.fit(X_train, y_train)

#Define model evaluation method
#cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
#Evaluate the model
# NOTE(review): cross_val_score is given X_test/y_test, so the folds are drawn
# only from the 20% hold-out split — confirm this is intended.
#scores = cross_val_score(model, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
#Absolute MAE
# NOTE(review): the original called absolute(scores), but `absolute` is not
# imported in the notebook's import cell; np.abs is already in scope.
#scores = np.abs(scores)
#print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

The XGBoost model achieved a mean MAE of 18.272 and MAE std of 8.974

Using linear regression

In [None]:
# NOTE: every statement in this cell is commented out and does not run.
#Fit a linear regression on the training set
#lin_reg = LinearRegression()
#lin_reg.fit(X_train, y_train)

#Define model evaluation method
#cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
#Evaluate the model
# NOTE(review): cross_val_score is given X_test/y_test, so the folds are drawn
# only from the 20% hold-out split — confirm this is intended.
#scores = cross_val_score(lin_reg, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
#Absolute MAE
# NOTE(review): the original called absolute(scores), but `absolute` is not
# imported in the notebook's import cell; np.abs is already in scope.
#scores = np.abs(scores)
#print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

The Linear Regression model achieved a mean MAE of 17.737 and MAE std of 11.099

In [None]:
#check coefficient of determination of linear regression model
# NOTE: commented out, does not run.
# NOTE(review): score() is called on the full x, y (training rows included),
# not on the held-out X_test/y_test — confirm this is intended.
#r_sq = lin_reg.score(x, y)
#print(f"coefficient of determination: {r_sq}")

The Linear Regression model has a smaller mean MAE than the XGBoost model, but it has a low R-squared.

In [None]:
#Predict new likes using XGBoost
# NOTE: commented out, does not run.
# The two values follow the column order of x in the commented-out feature
# definition: [Followers, Time since posted (hours)].
#X_new = np.array([[240, 7]])
#model.predict(X_new)

Summary

This is how you can analyze and predict the likes of Instagram posts with machine learning using Python. This notebook is one example of the role data science can play in social media. Feedback and advice on this notebook are welcome. Thank you.

References : 

https://thecleverprogrammer.com/2022/03/22/instagram-reach-analysis-using-python/
https://machinelearningmastery.com/xgboost-for-regression/