In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Read the training data and discard the four columns this notebook treats as
# too sparse to be useful.
data = pd.read_csv("/content/train (1).csv").drop(
    columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature']
)

# Target is the sale price; every remaining column is used as a feature.
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

# Object-dtype columns are handled as categorical, everything else as numeric.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by ordinary least squares.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then score the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 886536754.6621429


In [None]:
# Install the surprise library if not already installed
!pip install scikit-surprise

import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split

# Load the datasets
ratings = pd.read_csv("/content/ratings_small.csv")
movies = pd.read_csv("/content/movies_metadata.csv")

# Preprocess the movies dataset to ensure proper merging.
# Non-numeric ids are coerced to NaN, those rows are dropped, and the id
# column is cast to int. Built as one chain that returns a new frame, so no
# column is ever assigned on the result of a filter (the pattern that makes
# pandas emit SettingWithCopyWarning).
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': int})
)

# Load the ratings into Surprise.
# NOTE: from here on `data` is a Surprise Dataset and `train_test_split` is
# Surprise's splitter; both names mean something else in the regression cell.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Use SVD for collaborative filtering.
# Seeded like the split above: without a seed the factor initialisation is
# random, so the recommendations would differ on every run.
algo = SVD(random_state=42)

# Train the algorithm on the trainset
algo.fit(trainset)

# Function to get top n movie recommendations for a user
def get_top_n_recommendations(user_id, n=10):
    """Return up to ``n`` movie titles recommended for ``user_id``.

    Reads the module-level ``ratings`` and ``movies`` DataFrames and the
    fitted ``algo`` model. Every movie id in ``ratings`` that the user has
    not rated is scored with ``algo.predict``; candidates are ranked by
    estimated rating (highest first) and their titles are looked up in
    ``movies``.

    Candidates with no row in ``movies`` are skipped and the next-best
    candidate is used instead, so the result has ``n`` titles whenever at
    least ``n`` candidates have a title.

    Args:
        user_id: Raw user id as it appears in ``ratings['userId']``.
        n: Maximum number of titles to return.

    Returns:
        list: Titles ordered from highest to lowest estimated rating.
    """
    # A set gives O(1) membership checks when filtering the candidates.
    rated_movie_ids = set(ratings.loc[ratings['userId'] == user_id, 'movieId'])

    # Candidates keep their order of first appearance in `ratings`; the sort
    # below is stable, so ties in estimated rating keep that order.
    unrated_movie_ids = [
        movie_id
        for movie_id in ratings['movieId'].unique()
        if movie_id not in rated_movie_ids
    ]

    # Predict ratings for the unrated movies, best estimate first.
    predictions = [algo.predict(user_id, movie_id) for movie_id in unrated_movie_ids]
    predictions.sort(key=lambda pred: pred.est, reverse=True)

    # One id -> title mapping instead of filtering `movies` per prediction.
    # drop_duplicates keeps the first row per id, i.e. the first match.
    # NOTE(review): presumably ratings['movieId'] and movies['id'] come from
    # different id schemes (MovieLens vs. TMDB) - confirm; if so, ids must be
    # mapped through a links table before titles are looked up.
    title_by_id = (
        movies.drop_duplicates(subset='id').set_index('id')['title'].to_dict()
    )

    # Walk the ranking until n titles are collected, so a candidate without
    # metadata does not shrink the result.
    top_n_movie_titles = []
    for pred in predictions:
        if len(top_n_movie_titles) >= n:
            break
        if pred.iid in title_by_id:
            top_n_movie_titles.append(title_by_id[pred.iid])

    return top_n_movie_titles

# Demo: print a numbered list of recommendations for a single user.
user_id, n_recommendations = 1, 10
recommendations = get_top_n_recommendations(user_id, n_recommendations)

print(f"Top {n_recommendations} movie recommendations for user {user_id}:")
for i, movie in enumerate(recommendations, start=1):
    print(f"{i}. {movie}")