In [1]:
import pandas as pd
import numpy as np

## 27. How do I avoid a SettingWithCopyWarning in pandas? ([video](https://www.youtube.com/watch?v=4R4WsDJ-KVc&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=27))

In [2]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head(10)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
8,8.9,Schindler's List,R,Biography,195,"[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kings..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."


In [3]:
movies.content_rating.replace(to_replace='NOT RATED', value=np.nan, inplace=True)

In [4]:
movies.head(10)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
5,8.9,12 Angry Men,,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
8,8.9,Schindler's List,R,Biography,195,"[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kings..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."


In [5]:
movies.content_rating.isnull().sum()

68

In [7]:
# create a DataFrame only containing movies with a high 'star_rating'
top_movies = movies.loc[movies.star_rating >= 9, :]
top_movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


Is there a specific reason you don't use to .replace method to replace the values within a data frame? I personally don't use it and am wanting to know if there is a preferred way, .loc or .replace. I obviously don't know what issues that may arise considering I have been doing it wrong the WHOLE time using: 

    movies[movies.content_rating=='NOT RATED'].content_rating = np.nan 
 OR 
 
    movies[movies.content_rating=='NOT RATED']['content_rating'] = np.nan
    
Using .replace seems to avoid the problem of getting the SettingWithCopyWarning warning. 


    
The reason I got into this bad habbit is because of my heavy use of nested dictionaries. For exmaple, if I have a dictionary named 'models' with key names that represent the several models I use for my spectral fitting ('sbpl', 'pl', 'bbody'), and then a nested dictionry for each with their respective parameters (i.e., 'alpha' is a param of the 'sbpl' model), I will use the following quite often:

    models['sbpl']['alpha']
    models['sbpl']['beta']
    models['pl']['plIndex']
    models['bbody']['kT']
    
I use this to both retrieive and set the values of the model parameters. My question is, why doesn't pandas data frames use this same notation? If I update a value using:
    
    models['sbpl']['alpha'] = np.nan
    
I never have a problem writing it. 



to retrieve specific model parameter values (alpha, beta, plIndex, kT) from their respective models (sbpl, sbpl, pl, bbody). If you use dictionaries often, its easy to start using this notation in pandas. 
Is there not a way to change pandas functionality so that the get and set nature of dictionaries is the same in pandas? 



In [32]:
models = OrderedDict()

models['sbpl']   = {'alpha': -1.25, 'beta': -2.61, 'epeak': 524.1, 'norm': 0.001728}
models['pl']     = {'plIndex': -1.25, 'norm': 0.001728}
models['bbody']  = {'kT': 35.7, 'norm': 0.001728}



In [33]:
models['sbpl']

{'alpha': -1.25, 'beta': -2.61, 'epeak': 524.1, 'norm': 0.001728}

In [34]:
models['sbpl']['alpha']

-1.25

In [35]:
models['sbpl']['alpha'] = np.nan

In [36]:
models['sbpl']['alpha']

nan

In [17]:
from collections import OrderedDict

TOP_MOVIES = OrderedDict()

for key in top_movies.keys():
    TOP_MOVIES[key] = top_movies[key].tolist()

In [25]:
print(top_movies['title'][0])
print(TOP_MOVIES['title'][0])

The Shawshank Redemption
The Shawshank Redemption


In [19]:
TOP_MOVIES.keys()

odict_keys(['star_rating', 'title', 'content_rating', 'genre', 'duration', 'actors_list'])

In [20]:
top_movies.keys()

Index(['star_rating', 'title', 'content_rating', 'genre', 'duration',
       'actors_list'],
      dtype='object')

In [21]:
TOP_MOVIES['genre']

['Crime', 'Crime', 'Crime', 'Action']

In [22]:
top_movies['genre']

0     Crime
1     Crime
2     Crime
3    Action
Name: genre, dtype: object

In [None]:
# create a DataFrame only containing movies with a high 'star_rating'
top_movies1 = movies.loc[movies.star_rating >= 9, :]
top_movies1

In [4]:
# create a DataFrame only containing movies with a high 'star_rating'
top_movies2 = movies[movies.star_rating >= 9]
top_movies2

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [5]:
# overwrite the relevant cell with the correct duration
top_movies1.loc[0, 'duration'] = 150

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
# overwrite the relevant cell with the correct duration
top_movies2.loc[0, 'duration'] = 150

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
top_movies1

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,150,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [8]:
top_movies2

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,150,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [9]:
movies2 = movies.copy()

# finally, replace the 'NOT RATED' values with 'NaN' (imported from NumPy)
movies2[movies2.content_rating=='NOT RATED'].content_rating = np.nan

In [12]:
movies2.content_rating.isnull().sum()

3

In [None]:
movies[movies.content_rating == "NOT RATED"] = np.nan

In [13]:
movies[movies.content_rating=='NOT RATED'].content_rating = np.nan
# OR 
movies[movies.content_rating=='NOT RATED']['content_rating'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [19]:
?movies2.content_rating.replace

In [26]:
movies2.content_rating.replace(to_replace='NOT RATED', value=np.nan, inplace=True)

In [27]:
movies2.content_rating.isnull().sum()

68

In [29]:
movies.content_rating.replace(to_replace='NOT RATED', value=np.nan, inplace=True)

In [None]:
# finally, replace the 'NOT RATED' values with 'NaN' (imported from NumPy)
import numpy as np
movies[movies.content_rating=='NOT RATED'].content_rating = np.nan