In [8]:
pip install scikit-learn


Collecting scikit-learn
Note: you may need to restart the kernel to use updated packages.

  Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Using cached scipy-1.11.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl (9.2 MB)
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.2 MB 393.8 kB/s eta 0:00:24
   ---------------------------------------- 0.1/9.2 MB 655.4 kB/s eta 0:00:14
    --------------------------------------- 0.2/9.2 MB 871.5 kB/s eta 0:00:11
    --------------------------------------- 0.2/9.2 MB 841.6 kB/s eta 0:00:11
    --------------------------------------- 0.2/9.2 MB 841.6 kB/s eta 0:00:11
   


[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
import joblib

def load_and_preprocess_image(image_path):
    """Load one image file and return it as a VGG16-preprocessed array.

    The file at ``image_path`` is opened with ``load_img`` at a fixed
    224x224 target size, converted with ``img_to_array`` and finally run
    through the VGG16 ``preprocess_input`` function.

    Args:
        image_path: Location of the image file to read.

    Returns:
        Whatever ``preprocess_input`` produces for the converted image.
    """
    return preprocess_input(
        img_to_array(load_img(image_path, target_size=(224, 224)))
    )

def _resolve_image_path(image_folder, raw_path):
    """Turn one CSV ``image_path`` entry into a path inside ``image_folder``."""
    cleaned = raw_path.strip('"')
    direct = os.path.join(image_folder, cleaned)
    if os.path.exists(direct):
        return direct
    # The CSV may carry its own directory prefix (e.g. '../Data/insta_data/0.jpg'),
    # which does not resolve once appended to image_folder. Fall back to the bare
    # file name; both separators are handled so this also works off Windows.
    file_name = cleaned.replace('\\', '/').rsplit('/', 1)[-1]
    by_name = os.path.join(image_folder, file_name)
    # If neither candidate exists, return the direct join so the loader's error
    # message still shows the path exactly as the CSV described it.
    return by_name if os.path.exists(by_name) else direct


def preprocess_data(csv_path, image_folder):
    """Read the CSV, load every referenced image and add engineered features.

    The CSV must provide the columns ``image_path``, ``time``, ``caption``,
    ``follower_count_at_t`` and ``no_of_comments``.

    Args:
        csv_path: Path of the CSV file to read.
        image_folder: Directory that contains the image files.

    Returns:
        The DataFrame read from ``csv_path`` with ``image_path`` rewritten to
        point inside ``image_folder`` and these columns added: ``image_data``,
        ``day_of_week``, ``is_weekend``, ``caption_length``,
        ``followers_times_comments``, ``image_mean`` and ``image_std``.

    Raises:
        Whatever ``load_and_preprocess_image`` raises for an image that cannot
        be read (for example a missing file).
    """
    df = pd.read_csv(csv_path)

    # Resolve each entry against image_folder, tolerating directory prefixes
    # baked into the CSV values.
    df['image_path'] = df['image_path'].apply(
        lambda raw: _resolve_image_path(image_folder, raw)
    )

    # One preprocessed array per row. A single apply is equivalent to slicing
    # into batches and concatenating, and it also copes with an empty frame.
    df['image_data'] = df['image_path'].apply(load_and_preprocess_image)

    # Calendar features derived from the timestamp column.
    df['day_of_week'] = pd.to_datetime(df['time']).dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # A missing caption has length 0; str(NaN) would otherwise count as 3 ("nan").
    df['caption_length'] = df['caption'].apply(
        lambda text: 0 if pd.isna(text) else len(str(text))
    )

    df['followers_times_comments'] = df['follower_count_at_t'] * df['no_of_comments']

    # Scalar summaries of each image array.
    df['image_mean'] = df['image_data'].apply(lambda pixels: np.mean(pixels))
    df['image_std'] = df['image_data'].apply(lambda pixels: np.std(pixels))

    return df

def train_regression_model(X, y, test_size=0.2, random_state=42):
    """Fit a random-forest regressor on a train split and report hold-out MAE.

    The same ``random_state`` seeds both the train/test split and the forest,
    so the printed error is repeatable from run to run.

    Args:
        X: Feature matrix.
        y: Regression target aligned with ``X``.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed passed to the split and to the estimator.

    Returns:
        The fitted ``RandomForestRegressor`` (trained on the train split only).
    """
    model = RandomForestRegressor(random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Mean Absolute Error (Regression): {mae}')
    return model

def train_classification_model(X, y, test_size=0.2, random_state=42):
    """Fit a random-forest classifier on a train split and report hold-out accuracy.

    The same ``random_state`` seeds both the train/test split and the forest,
    so the printed accuracy is repeatable from run to run.

    Args:
        X: Feature matrix.
        y: Class labels aligned with ``X``.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed passed to the split and to the estimator.

    Returns:
        The fitted ``RandomForestClassifier`` (trained on the train split only).
    """
    model = RandomForestClassifier(random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy (Classification): {accuracy}')
    return model

def perform_hyperparameter_tuning(model, param_grid, X, y, scoring):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    Runs ``GridSearchCV`` with 5-fold cross-validation and ``n_jobs=-1``,
    prints the winning parameter combination, and hands back the search's
    ``best_estimator_``.

    Args:
        model: Estimator to tune.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        X: Feature matrix used for the search.
        y: Target aligned with ``X``.
        scoring: Scoring identifier forwarded to ``GridSearchCV``.

    Returns:
        The ``best_estimator_`` attribute of the fitted search.
    """
    search_options = {'cv': 5, 'scoring': scoring, 'n_jobs': -1}
    search = GridSearchCV(model, param_grid, **search_options)
    search.fit(X, y)

    winning_params = search.best_params_
    print(f"Best Hyperparameters: {winning_params}")

    return search.best_estimator_

def save_models(regression_model, classification_model):
    """Persist both models with joblib in the current working directory.

    The regression model is written to ``regression_model.joblib`` first,
    then the classification model to ``classification_model.joblib``.
    """
    outputs = (
        (regression_model, 'regression_model.joblib'),
        (classification_model, 'classification_model.joblib'),
    )
    for fitted_model, file_name in outputs:
        joblib.dump(fitted_model, file_name)

def load_models():
    """Read both persisted models back from the current working directory.

    Returns:
        A ``(regression_model, classification_model)`` tuple loaded with
        joblib from ``regression_model.joblib`` and
        ``classification_model.joblib``, in that order.
    """
    file_names = ('regression_model.joblib', 'classification_model.joblib')
    return tuple(joblib.load(file_name) for file_name in file_names)

def make_predictions(regression_model, classification_model, new_data_meta, new_data_images):
    """Run both models on the same metadata features.

    Args:
        regression_model: Object exposing ``predict``; called first.
        classification_model: Object exposing ``predict``; called second.
        new_data_meta: Feature data handed to both ``predict`` calls.
        new_data_images: Accepted for interface compatibility; not used here.

    Returns:
        A ``(regression_prediction, classification_prediction)`` tuple.
    """
    predictors = (regression_model, classification_model)
    return tuple(predictor.predict(new_data_meta) for predictor in predictors)

if __name__ == "__main__":
    # Machine-specific input locations; edit these to point at the local data.
    csv_path ="C:\\Users\\Ngugi\\Downloads\\instagram_data.csv"
    image_folder = "C:\\Users\\Ngugi\\Music\\Desktop\\Data\\insta_data"

    # Both models train on the same columns, so the list is defined once.
    feature_columns = ['no_of_comments', 't', 'follower_count_at_t', 'image_mean', 'image_std']

    # Preprocess data
    df = preprocess_data(csv_path, image_folder)

    # Features for regression
    X_meta_regression = df[feature_columns]
    y_regression = df['likes']

    # Features for classification: a post is positive when likes exceed the threshold.
    threshold = 100  # Adjust threshold as needed
    X_meta_classification = df[feature_columns]
    y_classification = (df['likes'] > threshold).astype(int)

    # Train regression model
    regression_model = train_regression_model(X_meta_regression, y_regression)

    # Train classification model
    classification_model = train_classification_model(X_meta_classification, y_classification)

    # Hyperparameter tuning (the same search space is used for both models)
    param_grid_regression = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20],
                              'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}

    param_grid_classification = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20],
                                  'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}

    tuned_regression_model = perform_hyperparameter_tuning(regression_model, param_grid_regression, X_meta_regression, y_regression, 'neg_mean_absolute_error')
    tuned_classification_model = perform_hyperparameter_tuning(classification_model, param_grid_classification, X_meta_classification, y_classification, 'accuracy')

    # Save models
    save_models(tuned_regression_model, tuned_classification_model)

    # Load models (round-trips through disk to check the saved files are usable)
    loaded_regression_model, loaded_classification_model = load_models()

    # Example: Make predictions for new data
    new_image_path = 'path_to_image.jpg'  # Replace with the path to a new image
    new_data_features = {'no_of_comments': [10], 't': [0], 'follower_count_at_t': [1000],
                         'image_mean': [0.5], 'image_std': [0.2]}

    # Only read the example image when it exists, so the placeholder path cannot
    # abort the cell after the (slow) training and tuning steps have finished.
    new_data_images = None
    if os.path.exists(new_image_path):
        new_data_images = load_and_preprocess_image(new_image_path)
        # Use the image's real statistics, computed the same way as in training.
        new_data_features['image_mean'] = [float(np.mean(new_data_images))]
        new_data_features['image_std'] = [float(np.std(new_data_images))]
    else:
        print(f"Example image '{new_image_path}' not found; using placeholder image_mean/image_std values.")

    new_data_meta = pd.DataFrame(new_data_features)

    regression_prediction, classification_prediction = make_predictions(loaded_regression_model, loaded_classification_model, new_data_meta, new_data_images)

    print(f"Regression Prediction: {regression_prediction}")
    print(f"Classification Prediction: {classification_prediction}")


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Ngugi\\Music\\Desktop\\Data\\insta_data\\../Data/insta_data/0.jpg'