In [1]:
# https://www.ritchieng.com/pandas-removing-duplicate-rows/
import pandas as pd
# read a dataset of movie reviewers (modifying the default parameter values for read_table):
#   sep='|'             -> fields are pipe-delimited
#   header=None + names -> the file has no header row, so column names are supplied explicitly
#   index_col='user_id' -> user_id becomes the index rather than a regular column
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
url = 'http://bit.ly/movieusers'
users = pd.read_table(url, sep='|', header=None, names=user_cols, index_col='user_id')
# preview the first five rows
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [2]:
# (rows, columns): user_id is the index, so only the 4 remaining columns are counted
users.shape

(943, 4)

In [3]:
# pattern: df.column_name.duplicated()
# outputs True when the same value has already appeared in an earlier row
# (anywhere above, not only the row directly above)
users.zip_code.duplicated()

user_id
1      False
2      False
3      False
4      False
5      False
       ...  
939    False
940     True
941    False
942    False
943    False
Name: zip_code, Length: 943, dtype: bool

In [4]:
# duplicated() on a column returns a pandas Series (of booleans, see the dtype above)
type(users.zip_code.duplicated())

pandas.core.series.Series

In [5]:
# True counts as 1, so .sum() on the boolean Series gives the number of duplicates
# (.count() would instead return the number of non-null entries, i.e. all 943 rows)
# there are 148 duplicated zip codes
users.zip_code.duplicated().sum()

148

In [6]:
# on a DataFrame, duplicated() outputs True only when the entire row (every column)
# matches an earlier row -- the matching row can be anywhere above, not just adjacent
users.duplicated()

user_id
1      False
2      False
3      False
4      False
5      False
       ...  
939    False
940    False
941    False
942    False
943    False
Length: 943, dtype: bool

In [7]:
# build the boolean mask of fully duplicated rows first,
# then use it to pull those rows (all columns) out of the frame
is_repeat_row = users.duplicated()
users.loc[is_repeat_row, :]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [8]:
# keep='first' (the default, so this matches the previous cell's result)
# mark duplicates as True except for the first occurrence
users.loc[users.duplicated(keep='first'), :]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [9]:
# keep='last'
# mark duplicates as True except for the last occurrence:
# the 7 rows shown are the earlier copies, the later copy of each is the one kept

# this is useful for splitting the data
users.loc[users.duplicated(keep='last'), :]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630


In [10]:
# keep=False: mark every occurrence of a duplicated row as True
# this combines the two tables above (keep='first' and keep='last')
users.loc[users.duplicated(keep=False), :]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402


In [11]:
# keep='first': keeps the first occurrence of each duplicated row
# and drops the 7 later copies (943 -> 936 rows)
users.drop_duplicates(keep='first').shape

(936, 4)

In [12]:
# keep='last': keeps the last occurrence of each duplicated row
# and drops the 7 earlier copies (943 -> 936 rows)
users.drop_duplicates(keep='last').shape

(936, 4)

In [13]:
# keep=False: drops every row that has a duplicate, i.e. all 14 rows (943 -> 929 rows)
users.drop_duplicates(keep=False).shape

(929, 4)

In [14]:
# restrict the duplicate check to the "age" and "zip_code" columns:
# a row counts as a duplicate when both values match an earlier row
dedup_keys = ['age', 'zip_code']
users.duplicated(subset=dedup_keys).sum()

16

In [15]:
# https://www.ritchieng.com/pandas-filtering-converting-series-NaN/
# (pandas is already imported in the first cell; the import is repeated here)
import pandas as pd
# read the IMDb ratings dataset with read_csv defaults and preview the first five rows
# note: `url` is re-bound here, replacing the movie-users URL assigned earlier
url = 'http://bit.ly/imdbratings'
movies = pd.read_csv(url)
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [16]:
# counting missing values:
# isnull() gives a boolean Series and sum() counts its True entries
movies.content_rating.isnull().sum()

3

In [17]:
# display the complete rows whose content_rating is missing
rating_is_missing = movies.content_rating.isnull()
movies.loc[rating_is_missing, :]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
187,8.2,Butch Cassidy and the Sundance Kid,,Biography,110,"[u'Paul Newman', u'Robert Redford', u'Katharin..."
649,7.7,Where Eagles Dare,,Action,158,"[u'Richard Burton', u'Clint Eastwood', u'Mary ..."
936,7.4,True Grit,,Adventure,128,"[u'John Wayne', u'Kim Darby', u'Glen Campbell']"


In [18]:
# count how often each content_rating value occurs;
# dropna=False makes value_counts() include the NaN entries as well
# there are 65 'NOT RATED' and 3 NaN entries
# the goal is to merge them into 68 NaN values
movies.content_rating.value_counts(dropna=False)

R            460
PG-13        189
PG           123
NOT RATED     65
APPROVED      47
UNRATED       38
G             32
PASSED         7
NC-17          7
X              4
GP             3
NaN            3
TV-MA          1
Name: content_rating, dtype: int64

In [19]:
# look at the full rows whose content_rating is the string 'NOT RATED'
is_not_rated = movies.content_rating == 'NOT RATED'
movies.loc[is_not_rated, :]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
41,8.5,Sunset Blvd.,NOT RATED,Drama,110,"[u'William Holden', u'Gloria Swanson', u'Erich..."
63,8.4,M,NOT RATED,Crime,99,"[u'Peter Lorre', u'Ellen Widmann', u'Inge Land..."
66,8.4,Munna Bhai M.B.B.S.,NOT RATED,Comedy,156,"[u'Sunil Dutt', u'Sanjay Dutt', u'Arshad Warsi']"
...,...,...,...,...,...,...
665,7.7,Lolita,NOT RATED,Drama,152,"[u'James Mason', u'Shelley Winters', u'Sue Lyon']"
673,7.7,Blow-Up,NOT RATED,Drama,111,"[u'David Hemmings', u'Vanessa Redgrave', u'Sar..."
763,7.6,Hunger,NOT RATED,Biography,96,"[u'Stuart Graham', u'Laine Megaw', u'Brian Mil..."
827,7.5,The Wind That Shakes the Barley,NOT RATED,Drama,127,"[u'Cillian Murphy', u'Padraic Delaney', u'Liam..."


In [20]:
# same row filter, but select only the content_rating column;
# with a single column label the result is a Series instead of a DataFrame
movies.loc[movies.content_rating=='NOT RATED', 'content_rating']

5      NOT RATED
6      NOT RATED
41     NOT RATED
63     NOT RATED
66     NOT RATED
         ...    
665    NOT RATED
673    NOT RATED
763    NOT RATED
827    NOT RATED
899    NOT RATED
Name: content_rating, Length: 65, dtype: object

In [22]:
import numpy as np
# NOTE: the value of this type() call is discarded -- only the last expression
# of a cell is displayed, so nothing from this line appears in the output
type(movies.loc[movies.content_rating=='NOT RATED', 'content_rating'])

# assigning through a single .loc[row_mask, column] call raises no warning
# chained indexing (e.g. movies[mask]['content_rating'] = ...) would trigger a SettingWithCopyWarning

# assigning np.nan to this selection converts every 'NOT RATED' entry to NaN
movies.loc[movies.content_rating=='NOT RATED', 'content_rating'] = np.nan
# the missing-value count has changed from 3 to 68 (3 original NaN + 65 former 'NOT RATED')
movies.content_rating.isnull().sum()

68

In [24]:
# select top_movies: all columns of the rows with star_rating of at least 9
# (this .loc selection is what the next cell assigns into)
top_movies = movies.loc[movies.star_rating >= 9, :]
top_movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [25]:
# there's a SettingWithCopyWarning here: top_movies was produced by slicing movies,
# and pandas is not sure whether it is a view of movies or an independent copy
top_movies.loc[0, 'duration'] = 150

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [26]:
# to avoid the warning (it is a warning, not an error), take an explicit .copy()
# of the selection before modifying it

top_movies = movies.loc[movies.star_rating >= 9, :].copy()
top_movies.loc[0, 'duration'] = 150

In [27]:
# confirm the assignment: the row with index label 0 now shows duration 150
top_movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,150,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
