# Pandas Tutorial

In [1]:
# Import the pandas library under the conventional alias 'pd', so that instead of writing the full name, e.g. pandas.DataFrame(), we can write pd.DataFrame().
# pandas is built on top of NumPy and uses it internally for most of its calculations and storage.
import pandas as pd

In [17]:
# Build a small demo DataFrame from a dict: each key becomes a column name
# and each list supplies that column's values, one entry per row.
d = {
    'tens': [10, 20, 30, 40],
    'hundreds': [100, 200, 300, 400],
    'thousands': [1000, 2000, 3000, 4000],
}
df = pd.DataFrame(d)
df

Unnamed: 0,tens,hundreds,thousands
0,10,100,1000
1,20,200,2000
2,30,300,3000
3,40,400,4000


In [18]:
# A DataFrame created without an explicit index gets a default integer RangeIndex.
df.index

RangeIndex(start=0, stop=4, step=1)

In [19]:
# Replace the default integer row labels with custom string labels (one label per row).
# Note: this modifies df in place, so every later cell sees the new index.
df.index=['a','b','c','d']

In [21]:
# Display the DataFrame again to see its row labels after the index was reassigned.
df

Unnamed: 0,tens,hundreds,thousands
a,10,100,1000
b,20,200,2000
c,30,300,3000
d,40,400,4000


In [20]:
# After assigning string labels, the index is a generic Index of strings instead of a RangeIndex.
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [2]:
# Read the ratings data from the file 'u.data' in the 'ml-100k' directory.
# The file is tab-separated; the column names are supplied explicitly via `names`
# (presumably because the file has no header row -- the first output row is data).
# A forward slash is used in the path so the notebook runs on Windows, Linux and macOS alike;
# a hard-coded backslash only works on Windows.
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
# Check whether the DataFrame has any missing values.
# .isna() returns a boolean DataFrame of the same shape, with True wherever the
# original cell is missing (None/NaN) and False everywhere else.
# .isnull() is an alias of .isna() and behaves identically.
ratings.isna()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
99995,False,False,False,False
99996,False,False,False,False
99997,False,False,False,False
99998,False,False,False,False


In [13]:
# Scanning a 100,000-row True/False table by eye to find missing values is impractical,
# so summarise it instead: apply value_counts to each column of the boolean mask.
# For every column this counts how many cells are False (present) and how many are
# True (missing); a 'True' row only appears if at least one value is missing.
# pd.Series.value_counts is used because the top-level pd.value_counts function is
# deprecated in recent pandas releases.
ratings.isna().apply(pd.Series.value_counts)

Unnamed: 0,user_id,movie_id,rating,timestamp
False,100000,100000,100000,100000


In [14]:
# Order the ratings by user_id (ascending is the default sort direction)
# and preview the first five rows with .head().
ratings.sort_values(by='user_id').head()

Unnamed: 0,user_id,movie_id,rating,timestamp
66567,1,55,5,875072688
62820,1,203,4,878542231
10207,1,183,5,875072262
9971,1,150,5,876892196
22496,1,68,4,875072688


In [15]:
# Sorting by several columns: rows are ordered by user_id first,
# and ties within the same user_id are then ordered by movie_id.
ratings.sort_values(by=['user_id', 'movie_id']).head()

Unnamed: 0,user_id,movie_id,rating,timestamp
32236,1,1,5,874965758
23171,1,2,3,876893171
83307,1,3,4,878542960
62631,1,4,3,876893119
47638,1,5,3,889751712


In [None]:
# Read the movie metadata from 'u.item' in the 'ml-100k' directory.
# The file is pipe-separated and decoded as latin-1; column names are supplied
# explicitly via `names`. The path uses a forward slash so it works on
# Windows, Linux and macOS (a hard-coded backslash only works on Windows).
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=[
        'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
        'unknown', 'action', 'adventure', 'animation', 'children', 'comedy',
        'crime', 'documentary', 'drama', 'fantasy', 'film_noir', 'horror',
        'musical', 'mystery', 'romance', 'scifi', 'thriller', 'war', 'western',
    ],
    encoding='latin-1',
)

In [None]:
# Preview the first five rows of the movies table.
movies.head()

In [None]:
# Read the user data from 'u.user' in the 'ml-100k' directory.
# The file is pipe-separated; column names are supplied explicitly via `names`.
# The path uses a forward slash so it works on Windows, Linux and macOS
# (a hard-coded backslash only works on Windows).
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

In [None]:
# Preview the first five rows of the users table.
users.head()