In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from unidecode import unidecode
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score


In [None]:
# Load the movies CSV; the file is read as ISO-8859-1 rather than the UTF-8 default
df = pd.read_csv(
    '/kaggle/input/imdb-india-movies/IMDb Movies India.csv',
    encoding='ISO-8859-1',
)
# Preview the first rows
df.head()

In [None]:
# Show the size of the raw frame as a (rows, columns) tuple
df.shape

In [None]:
# Print a per-column summary (non-null counts and dtypes) of the raw frame
df.info()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column.
# inplace=True mutates df directly, so re-running earlier cells is needed to get the raw frame back.
df.dropna(axis = 0, inplace= True)

In [None]:
# Re-count missing values per column; every count should now be 0 after the dropna step
df.isna().sum()

In [None]:
# Delete all fully duplicated rows.
# drop_duplicates() returns a new frame by default, so calling it without
# inplace=True (or without assigning the result) leaves df unchanged.
# inplace=True is used here to match the dropna step in this notebook.
df.drop_duplicates(inplace=True)

In [None]:
# True if at least one fully duplicated row is present in df, False if every row is unique
df.duplicated().any()

In [None]:
# Show the (rows, columns) size of df at this point, after the cleaning cells above
df.shape

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns as well
df.describe(include = 'all')

In [None]:
# Convert the text columns Votes, Year and Duration to integers.
# Each column goes through astype(str) first so this cell can be re-run safely:
# once a column is already int, the `.str` accessor would raise AttributeError.
# regex=False keeps ',' and 'min' literal regardless of the pandas version's default.

# Votes: remove the ',' thousands separators
df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=False).astype(int)
# Year: strip leading/trailing '(' and ')' characters (the text between them is kept)
df['Year'] = df['Year'].astype(str).str.strip('()').astype(int)
# Duration: remove the 'min' unit and any whitespace left around the number
df['Duration'] = df['Duration'].astype(str).str.replace('min', '', regex=False).str.strip().astype(int)


In [None]:
# Strip the characters # ( ) [ ] . ? : @ from the 'Name' column
name_noise_pattern = r'[#()\[\].?:@]'
df['Name'] = df['Name'].str.replace(name_noise_pattern, '', regex=True)


In [None]:
# Preview the first 10 rows to check the cleaned columns
df.head(10)


In [None]:
# Split the comma-separated Genre string into one column per genre.
# Whitespace around each comma is removed before splitting, so a value such as
# "A, B" yields "A" and "B" rather than "A" and " B" (which would otherwise be
# treated as a different genre downstream).
# NOTE(review): assumes the raw data may use ", " as separator - confirm against the CSV.
genres = (
    df['Genre']
    .str.strip()
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.split(',', expand=True)
)
genres.head(5)

In [None]:
from collections import Counter

# Flatten the 'genres' frame into a single 1-D array of cell values
flat_genres = genres.values.flatten()

# Count occurrences of each genre.
# - Only real strings are counted: the padding cells produced by the split may be
#   None or NaN depending on the pandas version, and a NaN key would make the
#   sorted() call below fail when compared with str keys.
# - Each name is stripped so " Drama" and "Drama" are counted as the same genre.
genre_counts = Counter(
    genre.strip()
    for genre in flat_genres
    if isinstance(genre, str) and genre.strip()
)

# Sort the genre counts alphabetically by genre name
genre_counts = dict(sorted(genre_counts.items()))

# Print the genre counts
for genre, count in genre_counts.items():
    print(f"{genre}: {count}")


In [None]:
# Stack the three actor columns into one Series, drop missing entries,
# and count how many times each actor appears
actor_columns = [df[column] for column in ('Actor 1', 'Actor 2', 'Actor 3')]
Actor_name = pd.concat(actor_columns).dropna().value_counts()
# Five most frequent actors
Actor_name.head(5)

In [None]:
# List the column labels of df
df.columns

**DATA VISUALIZATION**

In [None]:
# Set the style and font.
# The bare "seaborn" style name was deprecated in matplotlib 3.6 in favour of
# "seaborn-v0_8" and is rejected by newer releases, so use the new name when
# it is available and fall back to the old one otherwise.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
plt.rc('font', family='Times New Roman')

# Create a line plot of annual movie release counts
year_counts = df['Year'].value_counts().sort_index()
plt.plot(year_counts.index, year_counts.values, marker='o', linestyle='-')

# Customize the plot
plt.title("Movie Release Over Year")
plt.xlabel("Years")
plt.ylabel("No. of Movies")

# Place a tick every 5 years across the observed range. Positions, labels and
# rotation are set in a single call: rotating first and replacing the ticks
# afterwards can leave the new tick labels unrotated.
tick_positions = range(int(df['Year'].min()), int(df['Year'].max()) + 1, 5)
plt.xticks(tick_positions, tick_positions, rotation=90)

# Show the plot
plt.show()


In [None]:
# Plot the rating distribution as a count histogram (20 bins) with a KDE curve.
# Only one histogram is drawn: overlaying a density-normalised plt.hist on the
# count-based histplot would put two incompatible scales on the same y-axis
# under a single 'Frequency' label.
sns.histplot(data=df, x="Rating", bins=20, kde=True)
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Movie Rating Distribution')
plt.show()


In [None]:
# Set the style and font.
# The bare "seaborn" style name was deprecated in matplotlib 3.6 in favour of
# "seaborn-v0_8" and is rejected by newer releases, so use the new name when
# it is available and fall back to the old one otherwise.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)
plt.rc('font', family='Times New Roman')

# Total number of votes received by movies at each rating value
rating_votes = df.groupby('Rating')['Votes'].sum().reset_index()
sns.lineplot(data=rating_votes, x='Rating', y='Votes', marker='o')

# Customize the plot. The x-axis carries the rating values, so it is labelled
# "Rating" (it was previously mislabelled "Years").
plt.title("Votes per Rating")
plt.xlabel("Rating")
plt.ylabel("Votes")
plt.xticks(rotation=90)

# Show the plot
plt.show()


Training and Testing Dataset

In [None]:
# Features (X): every column of df except the text columns and the target.
# Target (Y): the Rating column.
non_feature_columns = ['Name', 'Director', 'Rating', 'Genre', 'Actor 1', 'Actor 2', 'Actor 3']
X = df.drop(columns=non_feature_columns)
Y = df['Rating']
           

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state is fixed so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
# Sizes of the full, test and training feature sets
(X.shape, X_test.shape, X_train.shape)

In [None]:
# Display the feature frame and the target series together as a tuple
X,Y

In [None]:
# Build a two-step pipeline: standard scaling followed by SGD regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor

# Each step is a (name, estimator) pair; the regressor's random_state is fixed
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('sgd', SGDRegressor(max_iter=10000, random_state=1000)),
]
scaler, sgd_reg = pipeline_steps

# Assemble the pipeline from the steps, in order
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the pipeline (scaler, then SGD regressor) on the training split only
pipeline.fit(X_train, Y_train)

In [None]:
# Predict ratings for the held-out test features with the fitted pipeline
y_pipe = pipeline.predict(X_test)

In [None]:
# Score the pipeline's test-set predictions against the true ratings
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Computed in order: mean absolute error, mean squared error, R-squared
mae_pipe, mse_pipe, r2_pipe = (
    metric(Y_test, y_pipe)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Report the three regression metrics, one per line
metric_report = (
    ("Pipeline Mean Absolute Error:", mae_pipe),
    ("Pipeline Mean Squared Error:", mse_pipe),
    ("Pipeline R-squared:", r2_pipe),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

In [None]:
# Read the three feature values from the user (each arrives as text)
Year = input('Enter the Year : ')
Duration = input('Enter the Duration : ')
Votes = input('Enter the Votes : ')

# Build a one-row frame, converting each value to int in Year, Duration, Votes order
raw_features = {'Year': Year, 'Duration': Duration, 'Votes': Votes}
new_input = pd.DataFrame({column: [int(text)] for column, text in raw_features.items()})

# Run the fitted pipeline on the single-row input
predicted_rating = pipeline.predict(new_input)

print("Predicted Rating:", predicted_rating)