# YouTube Mini Project

***Library used :***

    - Pandas
    
***Variables reference :***

    - df : pandas dataframe created from loading 'youtube.csv' file
    - bycategory : df grouped by video category
    - byrating : df sorted by ratings in descending order
    - ind :  a lis containing values for top 10 ratings
    
    
***Steps taken :*** 

    - Deleted redundant rows from 'youtube.csv' file
    - Labeled identifiable columns with proper names
    - loaded the resultant csv into a dataframe df

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('youtube.csv') ## loading csv data into dataframe

In [3]:
df

Unnamed: 0,user,1,category,videos_uploaded,views,rating,4,5
0,EvilSquirrelPictures,1135.0,Pets & Animals,252.0,1075.0,4.96,46.0,86.0
1,hggh22,1135.0,Comedy,169.0,228.0,5.00,5.0,3.0
2,TimeGem,1135.0,Entertainment,95.0,356.0,4.31,13.0,1.0
3,wooochacha,1135.0,Entertainment,118.0,1115.0,2.23,57.0,73.0
4,johnx113,1135.0,Entertainment,83.0,281.0,2.67,9.0,16.0
...,...,...,...,...,...,...,...,...
4095,RSMFD,863.0,Entertainment,63.0,116929.0,4.84,182.0,222.0
4096,RockstarGames,0.0,UNA,63.0,199361.0,4.79,1256.0,2410.0
4097,GTASite,1024.0,Entertainment,123.0,486752.0,4.86,1098.0,1764.0
4098,scrambledeggsTV,1100.0,Entertainment,136.0,13115.0,4.45,33.0,37.0


In [4]:
df.user.isna().sum() ##counting total number of NaN values in 'user' column

34

In [5]:
df = df[df.user.notna()] ## filtering out columns that are not NULL

In [6]:
df = df.reset_index() ## resetting dataframe indexes

In [7]:
df.drop('index', axis=1, inplace=True) ## dropping redundant columns

In [8]:
df.head()

Unnamed: 0,user,1,category,videos_uploaded,views,rating,4,5
0,EvilSquirrelPictures,1135.0,Pets & Animals,252.0,1075.0,4.96,46.0,86.0
1,hggh22,1135.0,Comedy,169.0,228.0,5.0,5.0,3.0
2,TimeGem,1135.0,Entertainment,95.0,356.0,4.31,13.0,1.0
3,wooochacha,1135.0,Entertainment,118.0,1115.0,2.23,57.0,73.0
4,johnx113,1135.0,Entertainment,83.0,281.0,2.67,9.0,16.0


In [9]:
df.dtypes

user                object
1                  float64
category            object
videos_uploaded    float64
views              float64
rating             float64
4                  float64
5                  float64
dtype: object

### Q1 ) Find out the top 5 categories with maximum number of videos uploaded.

In [10]:
a = df.groupby(by='category').groups

In [11]:
ans_dict = {}
for k,v in a.items():
    ans_dict[k] = len(v)

In [12]:
bycategory = pd.DataFrame(ans_dict.items(),columns=['category', 'videos_uploaded'])

In [13]:
bycategory.sort_values(by='videos_uploaded', ascending=False).head(5)

Unnamed: 0,category,videos_uploaded
4,Entertainment,908
7,Music,862
2,Comedy,414
10,People & Blogs,398
8,News & Politics,333


### Q2) Top 10 rated videos

In [14]:
byrating = df.sort_values(by='rating', ascending=False)
byrating

Unnamed: 0,user,1,category,videos_uploaded,views,rating,4,5
1414,RemixedKingz,1022.0,Music,157.0,1537.0,5.0,4.0,3.0
3093,athletic46,1076.0,Sports,55.0,235.0,5.0,2.0,1.0
3056,theevang1,1112.0,People & Blogs,234.0,192.0,5.0,3.0,0.0
3055,theevang1,1115.0,Film & Animation,70.0,1407.0,5.0,9.0,14.0
3054,theevang1,1107.0,Music,512.0,1445.0,5.0,11.0,1.0
...,...,...,...,...,...,...,...,...
3416,NFBCmedia,1023.0,Entertainment,1068.0,67.0,0.0,0.0,0.0
3417,NFBCmedia,1023.0,Entertainment,1099.0,42.0,0.0,0.0,0.0
3419,NFBCmedia,1031.0,Entertainment,819.0,52.0,0.0,0.0,0.0
3420,NFBCmedia,1017.0,Entertainment,1002.0,29.0,0.0,0.0,0.0


In [15]:
b = byrating.groupby(by='rating').groups

In [20]:
ind = sorted(b.keys(),reverse=True)[:11] ## top 10 ratings 
ind

[5.0, 4.99, 4.98, 4.97, 4.96, 4.95, 4.94, 4.93, 4.92, 4.91, 4.9]

In [17]:
byrating[byrating['rating'].isin(ind)]

Unnamed: 0,user,1,category,videos_uploaded,views,rating,4,5
1414,RemixedKingz,1022.0,Music,157.0,1537.0,5.0,4.0,3.0
3093,athletic46,1076.0,Sports,55.0,235.0,5.0,2.0,1.0
3056,theevang1,1112.0,People & Blogs,234.0,192.0,5.0,3.0,0.0
3055,theevang1,1115.0,Film & Animation,70.0,1407.0,5.0,9.0,14.0
3054,theevang1,1107.0,Music,512.0,1445.0,5.0,11.0,1.0
...,...,...,...,...,...,...,...,...
1900,rhymingwithoranges,1082.0,Entertainment,270.0,1720.0,4.9,106.0,84.0
2559,thermal1,1092.0,Education,804.0,1644.0,4.9,10.0,0.0
801,boydism08,1114.0,Music,247.0,1902.0,4.9,10.0,19.0
3012,1atsuchan24,921.0,Film & Animation,303.0,42561.0,4.9,68.0,16.0


### Q3) Most viewed videos

In [18]:
df

Unnamed: 0,user,1,category,videos_uploaded,views,rating,4,5
0,EvilSquirrelPictures,1135.0,Pets & Animals,252.0,1075.0,4.96,46.0,86.0
1,hggh22,1135.0,Comedy,169.0,228.0,5.00,5.0,3.0
2,TimeGem,1135.0,Entertainment,95.0,356.0,4.31,13.0,1.0
3,wooochacha,1135.0,Entertainment,118.0,1115.0,2.23,57.0,73.0
4,johnx113,1135.0,Entertainment,83.0,281.0,2.67,9.0,16.0
...,...,...,...,...,...,...,...,...
4061,RSMFD,863.0,Entertainment,63.0,116929.0,4.84,182.0,222.0
4062,RockstarGames,0.0,UNA,63.0,199361.0,4.79,1256.0,2410.0
4063,GTASite,1024.0,Entertainment,123.0,486752.0,4.86,1098.0,1764.0
4064,scrambledeggsTV,1100.0,Entertainment,136.0,13115.0,4.45,33.0,37.0


In [19]:
df[['user', 'category', 'views']][df['views'] == max(df['views'])]

Unnamed: 0,user,category,views
1091,kaejane,Film & Animation,65341925.0
