### Load the Dataset

In [1]:
import pandas as pd
# Load the dataset
df = pd.read_csv('instagram_reach.csv')


### Explore the Data

In [6]:
# Display basic information about the dataset
print(df.info())
# Check for missing values
print(df.isnull().sum())
print(df.columns)
# Explore the distribution of target variables
df[['Likes', 'Time since posted']].describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    object
 7   Likes              100 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 6.4+ KB
None
Unnamed: 0           0
S.No                 0
USERNAME             0
Caption              6
Followers            0
Hashtags             0
Time since posted    0
Likes                0
dtype: int64
Index(['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Followers', 'Hashtags',
       'Time since posted', 'Likes'],
      dtype='object')


Unnamed: 0,Likes
count,100.0
mean,46.48
std,55.08698
min,8.0
25%,19.0
50%,29.0
75%,46.0
max,349.0


### Data Pre Processing

In [21]:
# Handle missing values (if any)
df.fillna(method='ffill', inplace=True)
# Extract numerical part and convert to numeric
df['Time since posted'] = df['Time since posted'].str.extract('(\d+)').astype(float)



### Split the data

In [30]:
from sklearn.model_selection import train_test_split
# Define input features (X) and target variables (y)
X = df.drop(['Likes', 'Time since posted','USERNAME','Caption','Hashtags'], axis=1)
y_likes = df['Likes']
y_time_since_posted = df['Time since posted']

# Split the data into training and testing sets
X_train, X_test, y_likes_train, y_likes_test, y_time_train, y_time_test = train_test_split(X, y_likes, y_time_since_posted, test_size=0.2, random_state=42)


In [31]:
print(X_train)

    Unnamed: 0  S.No  Followers
55          16    24       3448
88           1    10       1158
26          26    30        265
42           3    11        273
69           7    19       1003
..         ...   ...        ...
60          21    29        145
71           9    24        383
14          14    18       2904
92           5    15        106
51          12    20       2277

[80 rows x 3 columns]


In [32]:
print(y_likes_train)

55    349
88     29
26     25
42     41
69     40
     ... 
60     16
71     50
14     28
92     12
51    157
Name: Likes, Length: 80, dtype: int64


### Build and Train the Model:


In [33]:
from sklearn.linear_model import LinearRegression

# Create separate models for predicting likes and Time Since Posted
likes_model = LinearRegression()
time_model = LinearRegression()
# Train the models
likes_model.fit(X_train, y_likes_train)
time_model.fit(X_train, y_time_train)


### Evaluate the model

In [34]:
from sklearn.metrics import mean_squared_error, r2_score
# Make predictions
y_likes_pred = likes_model.predict(X_test)
y_time_pred = time_model.predict(X_test)

# Evaluate the models
likes_mse = mean_squared_error(y_likes_test, y_likes_pred)
time_mse = mean_squared_error(y_time_test, y_time_pred)

likes_r2 = r2_score(y_likes_test, y_likes_pred)
time_r2 = r2_score(y_time_test, y_time_pred)

print(f'Likes Model MSE: {likes_mse}, R2 Score: {likes_r2}')
print(f'Time Since Posted Model MSE: {time_mse}, R2 Score: {time_r2}')


Likes Model MSE: 1272.3383135794722, R2 Score: 0.10248756208561693
Time Since Posted Model MSE: 9.479049588569747, R2 Score: 0.18617303382101358
