<a href="https://colab.research.google.com/github/khan-hashim/Intro-To-Artificial-Intelligence/blob/main/YouTube_Ranker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing libraries

In [395]:
#Importing the correct functions/libraries
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import  Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn import set_config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Loading the Dataset

In [None]:
# Read the CSV, trying several encodings in order until one decodes cleanly
url = "https://raw.githubusercontent.com/Vishek12/EECS-3401-Project/main/Global_YouTube_Statistics.csv"
encodings_to_try = ['utf-8', 'latin1', 'utf-16']
for encoding in encodings_to_try:
    try:
        youtube_data = pd.read_csv(url, encoding=encoding)
    except UnicodeDecodeError:
        # This encoding cannot decode the file; move on to the next candidate
        continue
    else:
        # First encoding that parses wins
        break
else:
    # Every candidate failed: stop here with a clear message instead of
    # hitting a NameError on youtube_data in the lines below
    raise ValueError(f"Could not decode {url} with any of: {encodings_to_try}")

# Sanity check: (rows, columns) of the freshly loaded data
print(youtube_data.shape)
#Displaying the data



# Deleting unnecessary columns

In [None]:
# Report the DataFrame dimensions before the unimportant columns are dropped
print(youtube_data.shape)

# Columns that are excluded from the rest of the analysis
columns_to_drop = [
    'rank', 'Title', 'Youtuber', 'Country', 'Abbreviation',
    'channel_type', 'created_date', 'created_month',
    'Latitude', 'Longitude',
]
youtube_data = youtube_data.drop(columns_to_drop, axis=1)

# Report the dimensions again to confirm the columns were removed
print(youtube_data.shape)


# 3. EDA on the Dataset

1) Histograms to understand the distribution of Data

In [None]:
# Draw the DataFrame's histograms on a single 24x16 figure to inspect the distributions
youtube_data.hist(figsize=(24, 16))


In [399]:
# The histogram shows that created_year contains a wrong value (1970),
# so those rows are removed before going any further
wrong_year_mask = youtube_data['created_year'] == 1970
wrong_year_rows = youtube_data[wrong_year_mask].index
youtube_data.drop(wrong_year_rows, inplace=True)

2) Correlation with regard to our target

In [None]:
# Pairwise correlations, restricted to the numeric columns
corr_matrix = youtube_data.corr(numeric_only=True)
# Correlation of every column with the target, largest value first
subscriber_corr = corr_matrix.loc[:, "subscribers"]
subscriber_corr.sort_values(ascending=False)

3) Visualizing different correlations

In [None]:
# Scatter plot: how subscriber count relates to total video views
X = youtube_data["video views"]
y = youtube_data["subscribers"]
# One large axes object; points drawn in blue
fig, pl = plt.subplots(figsize=(20, 10))
pl.scatter(x=X, y=y, color='b')
# Label both axes through the axes object created above
pl.set_xlabel("video views")
pl.set_ylabel("subscribers")


In [None]:
# Scatter plot: how subscriber count relates to the number of uploads
X = youtube_data["uploads"]
y = youtube_data["subscribers"]
# One large axes object; points drawn in blue
fig, pl = plt.subplots(figsize=(20, 10))
pl.scatter(x=X, y=y, color='b')
# Label both axes through the axes object created above
pl.set_xlabel("uploads")
pl.set_ylabel("subscribers")


In [None]:
# Line graph of subscribers against population, drawn without error bars
PopulationVsSubscribers = sns.lineplot(
    data=youtube_data,
    x="Population",
    y="subscribers",
    errorbar=None,
)

In [None]:
# Line graph of subscribers against the year the channel was created,
# drawn without error bars
CreatedYearVsSubscribers = sns.lineplot(
    data=youtube_data,
    x="created_year",
    y="subscribers",
    errorbar=None,
)

# 4. Preparing the data for Machine Learning Algorithms

In [None]:
# Clean-up ahead of the transformation pipelines: remove duplicate rows and
# report how many values are missing in each column

# Count the fully duplicated rows in the data (0 expected)
print("Data Duplicates before transformation pipelines:")
duplicates = youtube_data.duplicated().sum()
print(duplicates)
# Drop them in place if any turned up after all
if duplicates > 0:
    youtube_data.drop_duplicates(inplace=True)

# Per-column count of missing entries in the dataset
missing_values = youtube_data.isnull().sum()
print("\nMissing Values Before: ")
print(missing_values)


In [None]:
# Express the target in millions of subscribers so model errors are easier
# to read; pop() removes the raw 'subscribers' column while handing back
# its values for the division
youtube_data['subscribers_scaled by 1,000,000'] = youtube_data.pop('subscribers') / 1_000_000
# Preview the first rows with the rescaled target in place
youtube_data.head()

In [None]:
# Split the columns into numeric features and categorical features
num_cols = youtube_data.select_dtypes(include='number').columns.to_list()
cat_cols = youtube_data.select_dtypes(exclude='number').columns.to_list()
# Keep the target out of the numeric features so it is neither imputed nor
# scaled; remainder='passthrough' below carries it through unchanged
num_cols.remove("subscribers_scaled by 1,000,000")

# Numeric columns: fill gaps with the column mean, then standardize
num_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array. `sparse_output` replaces the old `sparse`
# keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4.
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(sparse_output=False))


# Route each column group through its own pipeline
preprocessing = ColumnTransformer([('num', num_pipeline, num_cols),
                                   ('cat', cat_pipeline, cat_cols)],
                                    remainder='passthrough')

# Display the assembled transformer
preprocessing

In [None]:
# Fit the preprocessing transformer on the data and transform it in one step
# NOTE(review): this fits on the full dataset, before the train/test split
# further down, so imputation and scaling statistics also see the test rows
prepped_values = preprocessing.fit_transform(youtube_data)

# Attach the transformer's output feature names as column labels
feature_names = preprocessing.get_feature_names_out()
youtube_prepped = pd.DataFrame(prepped_values, columns=feature_names)

# Dimensions and a preview of the prepared data
print(youtube_prepped.shape)
youtube_prepped.head()

In [None]:
# Double-check that no missing values remain after preprocessing
missing_values = youtube_prepped.isnull().sum()
print("\nMissing Values After: ")
# Per-column missing counts (shown as the cell output)
missing_values

In [None]:
# Split the dataset: 80% for training, 20% for testing

# Name the passthrough target column carries in the prepared frame
target_column = 'remainder__subscribers_scaled by 1,000,000'
X = youtube_prepped.drop(columns=[target_column])
y = youtube_prepped[target_column]

# Fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)



#Step 5: Training Machine Learning Algorithms

1) Linear Regression

In [None]:
# Fit an ordinary least-squares linear regression on the training split
lr_model = LinearRegression()

lr_model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out test split and report the Mean Absolute Error
lr_y_predict = lr_model.predict(X_test)

lr_mae = mae(y_true=y_test, y_pred=lr_y_predict)

print(lr_mae)

2) Lasso Regression

In [413]:
# Fit a Lasso (L1-regularized) regression with alpha=1 on the training split

LassoRegression = Lasso(alpha=1)
lasso_model = LassoRegression.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out test split and report the Mean Absolute Error
Lasso_y_predict = lasso_model.predict(X_test)

lasso_mae = mae(y_true=y_test, y_pred=Lasso_y_predict)

print(f'Lasso Regression MAE: {lasso_mae}')

3) Polynomial with Ridge Regularization

In [415]:
# Expand the features with all degree-2 polynomial terms
poly = PolynomialFeatures(degree=2)
# Fit the expansion on the training split only, then apply the same
# fitted expansion to both splits
poly.fit(X_train)
X_train_trans = poly.transform(X_train)
X_test_trans = poly.transform(X_test)

In [416]:
# Fit a Ridge (L2-regularized) regression with alpha=1 on the
# polynomial-expanded training features

RidgeRegression = Ridge(alpha=1)
ridge_model = RidgeRegression.fit(X=X_train_trans, y=y_train)


In [None]:
# Evaluating the Ridge model: 5-fold cross-validation on the training set,
# then Mean Absolute Error on the held-out test set

ridge_scores = cross_val_score(ridge_model, X_train_trans, y_train, cv=5, scoring='neg_mean_absolute_error')

# The scorer returns the negated MAE (so that higher is better);
# flip the sign to recover the mean absolute error across the folds
mean_score = -ridge_scores.mean()

print(f'Cross-Validation Mean Score: {mean_score}')

# Test-set MAE, reported the same way as for the other two models.
# cross_val_score works on clones, so ridge_model is still the estimator
# fitted on the full training split.
ridge_y_predict = ridge_model.predict(X_test_trans)

ridge_mae = mae(y_test, ridge_y_predict)

print(f'Polynomial Ridge Regression MAE: {ridge_mae}')



# 6. Graphs for the best-performing algorithms


In [None]:
# Predicted vs. actual target values for the linear regression model,
# using the held-out test split
lr_y_predict = lr_model.predict(X_test)
# Predictions on the x-axis, true values on the y-axis
plt.scatter(x=lr_y_predict, y=y_test)
plt.title("Predicted vs. Actual Values")
plt.xlabel("Subscribers Predicted Values")
plt.ylabel("Subscribers Actual Values")
plt.show()