<a href="https://colab.research.google.com/github/innocentmatutu/Machine-learning/blob/main/Instagram_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import kagglehub
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Download dataset
path = kagglehub.dataset_download("kundanbedmutha/instagram-analytics-dataset")

#Check what files are inside
os.listdir(path)

csv_path = os.path.join(path,'Instagram_Analytics.csv')
df = pd.read_csv(csv_path)

#Encoding of categorical columns
for col in df.select_dtypes(include=['object']).columns:
  encoder = LabelEncoder()
  df[col] = encoder.fit_transform(df[col])

#Filling missing values with the mean of the numerical column
def impute_with_extension(X_train, X_valid):
  cols_with_missing = [col for col in X_train.columns if df[col].isnull().any()]
  X_train_plus = X_train.copy()
  X_valid_plus = X_valid.copy()

  for col in cols_with_missing:
    X_train_plus[col +'_was_missing'] = X_train_plus.isnull()
    X_valid_plus[col +'_was_missing'] = X_valid_plus.isnull()

  impute = SimpleImputer(strategy='mean')
  imputed_X_train = pd.DataFrame(impute.fit_transform(X_train_plus),columns = X_train_plus.columns)
  imputed_X_valid = pd.DataFrame(impute.transform(X_valid_plus),columns = X_valid_plus.columns)

  return imputed_X_train, imputed_X_valid

#Feature and target selection
y = df['content_category']
df.drop(['content_category'],axis=1,inplace=True)

features = ['likes', 'comments', 'shares',
       'saves', 'reach', 'impressions', 'caption_length', 'hashtags_count',
       'followers_gained', 'traffic_source', 'engagement_rate']
X = df[features]

#Training, testing and splitting of our data
X_train, X_valid, y_train, y_valid = train_test_split(X, y , test_size=0.2, random_state=42)

#Call our function
imputed_X_train, imputed_X_valid = impute_with_extension(X_train, X_valid)

#Model selection, training and prediction
model = xgb.XGBClassifier(n_estimators=500,
                          learning_rate=0.05,
                          eval_metric = 'mlogloss')

model.fit(imputed_X_train, y_train)
prediction = model.predict(imputed_X_valid)

#Metrics
#print(f"Accuracy score: {accuracy_score(y_valid,prediction)}")
#print(f"Precision score: {precision_score(y_valid,prediction, average='weighted')}")
#print(f"Recall score: {recall_score(y_valid,prediction, average='weighted')}")
#print(f"F1 score: {f1_score(y_valid,prediction,average='weighted')}\n")







Using Colab cache for faster access to the 'instagram-analytics-dataset' dataset.


In [1]:
import kagglehub
import pandas as pd
import os

#Download dataset
path = kagglehub.dataset_download("kundanbedmutha/instagram-analytics-dataset")

#Check what files are inside
os.listdir(path)

csv_path = os.path.join(path,'Instagram_Analytics.csv')
df = pd.read_csv(csv_path)

df.columns

Using Colab cache for faster access to the 'instagram-analytics-dataset' dataset.


Index(['post_id', 'upload_date', 'media_type', 'likes', 'comments', 'shares',
       'saves', 'reach', 'impressions', 'caption_length', 'hashtags_count',
       'followers_gained', 'traffic_source', 'engagement_rate',
       'content_category'],
      dtype='object')

In [1]:
import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Download dataset
path = kagglehub.dataset_download("kundanbedmutha/instagram-analytics-dataset")
csv_path = os.path.join(path, 'Instagram_Analytics.csv')
df = pd.read_csv(csv_path)

#Target + features
y = df['content_category']
df.drop(['content_category'], axis=1, inplace=True)

features = ['likes', 'comments', 'shares', 'saves', 'reach', 'impressions',
            'caption_length', 'hashtags_count', 'followers_gained',
            'traffic_source', 'engagement_rate']

X = df[features]

#Split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

#column types
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64','float64']]

#Preprocessing
numerical_transformer = SimpleImputer(strategy='mean')

categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocesser = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

#Model
model = RandomForestClassifier(n_estimators=500, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocesser', preprocesser),
    ('model', model)
])

#Train
pipeline.fit(X_train, y_train)

#Predict
pred = pipeline.predict(X_valid)

#Metrics
#print("Accuracy:", accuracy_score(y_valid, pred))
#print("Precision:", precision_score(y_valid, pred, average='weighted'))
#print("Recall:", recall_score(y_valid, pred, average='weighted'))
#print("F1:", f1_score(y_valid, pred, average='weighted'))


Using Colab cache for faster access to the 'instagram-analytics-dataset' dataset.
Accuracy: 0.09966666666666667
Precision: 0.09925845849876411
Recall: 0.09966666666666667
F1: 0.09934792453013001
