<a href="https://colab.research.google.com/github/innocentmatutu/Machine-learning/blob/main/Movie_Ratings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import  mean_squared_error,mean_absolute_error,r2_score

#Load the dataset; the path points at a zip archive and read_csv is left to
#infer the compression from the file extension
df = pd.read_csv('/content/archive (10).zip')

#Target selection: keep 'averageRating' as y and remove it from the features
y = df['averageRating']
df.drop(columns=['averageRating'], inplace=True)

#Hold out 20% of the rows for validation; the fixed seed makes the split
#repeatable between runs
X_train, X_valid, y_train, y_valid = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=1,
)

#Select categorical columns with relatively low cardinality.
#The columns are chosen by looking at the training split only, so the
#validation rows have no influence on which features are kept.
#The dtype test runs first so nunique() is only computed for object columns.
categorical_cols = [
    col for col in X_train.columns
    if X_train[col].dtype == 'object' and X_train[col].nunique() < 10
]

#Select numerical columns (64-bit integer and float dtypes)
numerical_cols = [
    col for col in X_train.columns
    if X_train[col].dtype in ['int64', 'float64']
]

#Keep only the selected columns, categorical first, in both splits
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()

#Model selection: XGBoost regressor with 500 estimators and a max depth of 5
model = xgb.XGBRegressor(max_depth=5, n_estimators=500)

#Numerical columns: replace missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

#Categorical columns: replace missing values with the most frequent value,
#then one-hot encode; handle_unknown='ignore' keeps categories that were not
#seen during fit from raising at transform time
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

#Bundle the numerical and categorical preprocessing, each applied to its own
#list of columns
preprocesser = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

#Chain preprocessing and model so both are fitted and applied together.
#NOTE: the step name 'preporcesser' is kept exactly as spelled because step
#names are part of the pipeline's parameter keys.
my_pipeline = Pipeline([
    ('preporcesser', preprocesser),
    ('model', model),
])

#Fit the preprocessing steps and the model on the training split
my_pipeline.fit(X_train, y_train)

#Predict the target for the held-out validation split
prediction = my_pipeline.predict(X_valid)

#Metrics on the validation split.
#'.6f' states the six-decimal precision explicitly; a bare '4f' only sets a
#minimum field width of 4 and does not control the number of decimals.
print(f'Mean squared error: {mean_squared_error(y_valid, prediction):.6f}')
print(f'Mean absolute error: {mean_absolute_error(y_valid, prediction):.6f}')
print(f'r2 score: {r2_score(y_valid, prediction):.6f}')

Mean squared error: 0.001903
Mean absolute error: 0.025300
r2 score: 0.970217
