In [1]:
#importing libraries
import csv
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans
from matplotlib import colors as mcolors 
import math 
from sklearn.decomposition import PCA 
from mpl_toolkits.mplot3d import Axes3D 
from matplotlib import cm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tab-separated user-rating file and make sure the movie id column is
# an integer, since it is used as the join key against the movie catalogue.
movies_rating = (
    pd.read_csv('./data/u1.data', delimiter='\t')
    .astype({'movieid': int})
)

movies_rating.head()

Unnamed: 0,userid,movieid,rating,timestamp,gender,occupation,agerange
0,196,242,3,881250949,M,farmer,1
1,186,302,3,891717742,M,farmer,18
2,22,377,1,878887116,F,sales/marketing,1
3,244,51,2,880606923,M,lawyer,25
4,166,346,1,886397596,M,artist,50


In [3]:
# Average rating of each movie, as a one-column frame indexed by movie id.
#
# Only the 'rating' column is aggregated. The ratings frame also carries text
# columns (gender, occupation); taking the mean of the whole frame raises a
# TypeError on pandas >= 2.0, and the other numeric columns (userid, timestamp,
# agerange) would have to be dropped afterwards anyway.
avgRatingsMovies = movies_rating.groupby('movieid')[['rating']].mean()
avgRatingsMovies.head()

Unnamed: 0_level_0,rating
movieid,Unnamed: 1_level_1
1,3.878319
2,3.206107
3,3.033333
4,3.550239
5,3.302326


In [None]:
# Bar chart with one bar per movie id showing its average rating.
# The figure is made very wide so that individual bars remain distinguishable.
sns.set(rc={'figure.figsize': (100, 20)})

axis_text = dict(xlabel='Movies ID', ylabel='Avg. Rating', title='All Movies Average Rating')
ax = sns.barplot(x=avgRatingsMovies.index, y=avgRatingsMovies['rating'])
ax.set(**axis_text)

[Text(0, 0.5, 'Avg. Rating'),
 Text(0.5, 0, 'Movies ID'),
 Text(0.5, 1.0, 'All Movies Average Rating')]

In [None]:
# Read the pipe-delimited movie catalogue into a DataFrame of strings.
#
# The encoding is given explicitly: the MovieLens 100K 'u.item' file is Latin-1
# encoded, so relying on the platform default fails with UnicodeDecodeError on
# systems whose default is UTF-8. newline='' is what the csv module expects for
# files it parses itself.
# NOTE(review): assumes './data/u.item' is the original MovieLens file - confirm.
with open('./data/u.item', 'r', encoding='latin-1', newline='') as f:
    reader = csv.reader(f, dialect='excel', delimiter='|')
    # Skip empty records (e.g. a trailing blank line) so that they do not become
    # all-None rows that would break the later integer conversion.
    movies = [row for row in reader if row]

movies_df = pd.DataFrame(
    movies,
    columns=[
        'movieid', 'movietitle', 'releasedate', 'videoreleasedate', 'IMDbURL',
        'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western',
    ],
)

display(movies_df.head())

In [None]:
# Genre indicator columns of the movie catalogue, in plotting order.
genre = ['Action','Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','FilmNoir','Horror','Musical','Mystery','Romance','SciFi','Thriller','War','Western']

# Parse the release date strings into datetimes.
movies_df['releasedate'] = pd.to_datetime(movies_df['releasedate'])

# The catalogue was read as text: cast the id and every genre flag to int.
movies_df = movies_df.astype({'movieid': int, **{name: int for name in genre}})

# Number of movies per genre, summed column-wise in one pass. Driving the sum
# from the `genre` list keeps labels and counts aligned; the previous hand-written
# per-genre sums had computed the 'War' total from the 'Romance' column.
genre_totals = movies_df[genre].sum()
categoryNumber = genre_totals.tolist()

# Bar chart of the number of movies in each genre.
sns.set(rc={'figure.figsize':(20,10)})
ax = sns.barplot(x=genre, y=categoryNumber)
ax.set(ylabel='Number of Movies', xlabel='Genre', title='Movies List by Genre from 1922 - 1998')

In [None]:
# Join every rating row with the catalogue entry of the rated movie.
# A left join keeps all rating rows, including those whose movie id has no
# match in the catalogue.
clusterData = movies_rating.merge(movies_df, how='left', on='movieid')

display(clusterData)

In [None]:
# Sum the genre flags of all rated movies for each
# (occupation, gender, age-range) combination.
demographic_keys = ['occupation', 'gender', 'agerange']
grp = clusterData.groupby(demographic_keys)[genre].sum()
display(grp)


In [None]:
# For each genre, show which occupation, gender and age range has the highest
# summed count for that genre.
#
# Age-range codes:
#    1: "Under 18"
#   18: "18-24"
#   25: "25-34"
#   35: "35-44"
#   45: "45-49"
#   50: "50-55"
#   56: "56+"

# Largest summed count reached by any group, one value per genre column.
maxMovie = grp.max()

# One [genre, occupation, gender, age-range] row per genre. When several groups
# share the maximum, the first one in index order is reported.
top_groups = []
for genre_name, peak in maxMovie.items():
    occ, sex, age = grp.index[(grp[genre_name] == peak).values][0]
    top_groups.append([genre_name, occ, sex, int(age)])

# Columns are left with their default integer labels (0..3).
mov_genre_occ_gender = pd.DataFrame(top_groups)

print('\tGenre\t    Occupation\tGender\tAge Range\n')
print(mov_genre_occ_gender.to_string(index=False, header=False))

In [None]:
# Bar chart: one bar per genre, bar height = age-range code (column 3) of the
# group with the highest count for that genre.
sns.set(rc={'figure.figsize': (20, 15)})

ax = sns.barplot(x=genre, y=mov_genre_occ_gender[3])
ax.set(ylabel='Age Range', xlabel='Genre', title='Mostly viewed movies genre by occupation and Age range')
h, l = ax.get_legend_handles_labels()

# Legend entries are the occupations (column 1) of those groups.
# NOTE(review): labels are passed without explicit handles, so matplotlib pairs
# them with whatever artists it finds on the axes - confirm the entries line up
# with the bars as intended.
occupation_legend = ax.legend(title='Occupations', loc='upper right',
                              labels=mov_genre_occ_gender[1], markerscale=20)
plt.setp(occupation_legend.get_texts(), fontsize='22')  # legend entries
plt.setp(occupation_legend.get_title(), fontsize='25')  # legend title


In [None]:
# Option 1 for users: which genre has the highest count for a given profile?
# Example: a female homemaker aged between 35 and 44.
#
# Age-range codes:
#    1: "Under 18"
#   18: "18-24"
#   25: "25-34"
#   35: "35-44"
#   45: "45-49"
#   50: "50-55"
#   56: "56+"

occupation = 'homemaker'
gender = 'F'
# Numeric code, to match the numeric 'agerange' index level. Comparing that
# level against the string '35' never matches any row.
agerange = 35

gen = 'Male' if gender == 'M' else 'Female'

levels = grp.index
profile_mask = (
    (levels.get_level_values('occupation') == occupation)
    & (levels.get_level_values('gender') == gender)
    & (levels.get_level_values('agerange') == agerange)
)
profile_counts = grp.loc[profile_mask]

if profile_counts.empty:
    print('No ratings found for', gen, occupation, 'of age range', agerange)
else:
    # idxmax returns the genre column holding the largest count. The built-in
    # max() on a DataFrame iterates over its column NAMES and would simply
    # return the alphabetically last genre, whatever the data.
    mov_gen = profile_counts.sum().idxmax()
    print(gen,occupation, 'of age range',agerange,'likes',mov_gen,'movies')

In [None]:
# Option 2 for users: which kind of people have the highest count for a given
# genre? Example: Crime movies.

mov_gen = 'Crime'

# Index entries (occupation, gender, age-range) of the group(s) reaching the
# highest count for the chosen genre; the first one is reported.
peak_count = grp[mov_gen].max()
r = grp.loc[grp[mov_gen] == peak_count].index

gen = 'Male' if r[0][1] == 'M' else 'Female'

print(mov_gen, 'type movies mostly watched by', gen, r[0][0], 'of age range', r[0][2])


In [None]:
# Replace the release date by its year, then total the genre flags per year.
# NOTE: this rebinds `movies_df` to the per-year summary, so the cell cannot be
# re-run without first re-running the cells that build the catalogue frame.
movies_df = (
    movies_df
    .assign(releasedate=movies_df['releasedate'].dt.year)
    .groupby(['releasedate'])[genre]
    .sum()
)

display(movies_df)

# Keep the years 1994 through 1998 (inclusive) and plot them as grouped bars.
# NOTE(review): the variable name suggests the whole 1990s, but the filter
# starts at 1994 - confirm which range is intended.
release_years = movies_df.index
in_window = (release_years >= 1994) & (release_years <= 1998)
movies_1990s = movies_df.loc[in_window]
movies_1990s.plot.bar()

In [None]:
# Clustering with the elbow method.
#
# Two-dimensional data: x = release year, y = number of Drama movies that year
# (Drama is used because it is the most frequent genre).
Data = {'x': movies_df.index,
        'y': movies_df.Drama}

df = pd.DataFrame(Data, columns=['x', 'y'])

display(df)

# Initial 3-cluster fit. Seeded and configured like the other KMeans calls in
# this notebook so that labels and centroids are reproducible between runs.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(df)
centroids = kmeans.cluster_centers_
display(centroids)

# Cluster scatter on its own figure; without this the elbow curve below would
# be drawn on top of the scatter in the same axes.
plt.figure()
plt.scatter(df['x'], df['y'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.xlabel('Release year')
plt.ylabel('Number of Drama movies')
plt.show()

# Within-cluster sum of squares for k = 1..10. A separate model name is used so
# the fitted 3-cluster `kmeans` above is not overwritten by the loop.
wcss = []
for i in range(1, 11):
    elbow_model = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    elbow_model.fit(df)
    wcss.append(elbow_model.inertia_)

plt.figure()
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


In [None]:
# K-means with four clusters on the (year, Drama count) frame; the raw points
# and the fitted cluster centres (large red markers) are drawn together.
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(df)

centres = kmeans.cluster_centers_
plt.scatter(df['x'], df['y'])
plt.scatter(centres[:, 0], centres[:, 1], s=300, c='red')
plt.show()

In [None]:
# Multidimensional data: cluster the per-year genre totals (one dimension per
# genre) and visualise the clusters on the first three principal components.
clusters = 18

# Seeded like the other KMeans calls in this notebook so that cluster labels,
# and therefore the colours in the plot, are reproducible between runs.
kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0)
kmeans.fit(movies_df)

display(kmeans.labels_)

# Project the data onto three principal components for plotting.
pca = PCA(3)
pca.fit(movies_df)

pca_data = pd.DataFrame(pca.transform(movies_df))

display(pca_data.head())

# Named matplotlib colours ordered by ascending HSV value.
named_colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
colors = tuple(
    name
    for _, name in sorted(
        (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
        for name, color in named_colors.items()
    )
)

# Step through the colour list (ignoring five entries at each end) so that at
# least `clusters` evenly spaced colours are picked. The step is clamped to 1
# because a slice step of 0 raises ValueError.
skips = max(1, math.floor(len(colors[5:-5]) / clusters))
cluster_colors = colors[5:-5:skips]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca_data[0], pca_data[1], pca_data[2],
           c=[cluster_colors[label] for label in kmeans.labels_])

# Annotate every point with its cluster label.
str_labels = ['% s' % label for label in kmeans.labels_]
for pc1, pc2, pc3, point_label in zip(pca_data[0], pca_data[1], pca_data[2], str_labels):
    ax.text(pc1, pc2, pc3, s=point_label, size=16.5, zorder=20, color='k')

plt.show()

In [None]:
# Correlation between the per-year genre totals, drawn as a 3-D surface.
# A dedicated name is used so the clustering frame `df` from the earlier cells
# is not overwritten by this cell.
genre_corr = movies_df.corr()
n_genres = len(genre_corr)

# Flatten the matrix into (row position, column position, correlation) triples
# in row-major order. The diagonal (self-correlation) is set to 0.
df_list = [
    (i, j, 0 if i == j else genre_corr.iloc[i, j])
    for i in range(n_genres)
    for j in range(n_genres)
]

# Trivariate frame: column 0 = row position, 1 = column position, 2 = value.
plot_df = pd.DataFrame(df_list)

fig = plt.figure()
# Create the 3-D axes through the figure. A bare Axes3D(fig) is no longer
# attached to the figure on current Matplotlib releases, which leaves the
# plot empty.
ax = fig.add_subplot(111, projection='3d')
# 3-D triangulated surface of the correlation values.
ax.plot_trisurf(plot_df[0], plot_df[1], plot_df[2],
                cmap=cm.jet, linewidth=0.2)
ax.set_xlabel('Genre index')
ax.set_ylabel('Genre index')
ax.set_zlabel('Correlation')

plt.show()