# Data Cleaning with Pandas — Avoid this Mistake!

Source code for the Towards Data Science (Medium) article ["Data Cleaning with Pandas — Avoid this Mistake!"](https://towardsdatascience.com/data-cleaning-with-pandas-avoid-this-mistake-7af559657c2c) by [Sarah Eade](https://towardsdatascience.com/@seade03).

# Import libraries

In [1]:
import pandas as pd

# What are Mixed Type Columns?

In [2]:
# Build a one-column frame whose two values look alike but differ in type:
# one is a float, the other a string.
df = pd.DataFrame({"price": [9.99, "9.99"]})

# Report the Python type of each individual element stored in the column.
element_types = [type(value) for value in df["price"]]
print(element_types)


[<class 'float'>, <class 'str'>]


In [3]:
# The mixed float/str column is reported with the generic `object` dtype.
df.dtypes

price    object
dtype: object

In [5]:
# Start from a Series that holds only integers.
int_series = pd.Series([1, 2, 3])
dtype_before = int_series.dtype
print(dtype_before)  # int64

# Adding a string under a new label (3) changes the dtype of the whole
# Series from int64 to object.
int_series.loc[3] = "4"
dtype_after = int_series.dtype
print(dtype_after)  # object

int64
object


# Where this Causes Problems

In [6]:
# Load the two raw inputs; the sales "date" column is parsed into datetimes.
sales_data = pd.read_csv("./data/sales_data.csv", parse_dates=["date"])
census_data = pd.read_csv("./data/population_by_zip_2010.csv")

# Collapse the census rows to a single total population per zipcode and
# turn the group key back into a regular column.
population = (
    census_data
    .groupby("zipcode")["population"]
    .sum()
    .reset_index()
)

In [7]:
# Preview three rows; the fixed random_state makes the draw repeatable.
sales_data.sample(3, random_state=1)

Unnamed: 0,date,units_sold,address_line,city,state,zip_code
559,2018-06-18 17:14:37,0,615 E 89TH,CA,USA,90002-1635
347,2019-04-12 19:27:48,8,8800 WADSWORTH,CA,USA,
396,2019-07-24 07:45:57,2,9914 S BROADWAY,CA,USA,90003-4173


In [8]:
# Inspect dtypes and null counts: zip_code is an object column with only
# 521 non-null values out of 594 rows, i.e. 73 rows lack a zip code.
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          594 non-null    datetime64[ns]
 1   units_sold    594 non-null    int64         
 2   address_line  594 non-null    object        
 3   city          594 non-null    object        
 4   state         594 non-null    object        
 5   zip_code      521 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 28.0+ KB


In [9]:
# Peek at the raw census rows: each row carries a population count together
# with an age range, a gender and a zipcode.
census_data.head(3)

Unnamed: 0,population,minimum_age,maximum_age,gender,zipcode,geo_id
0,50,30.0,34.0,female,61747,8600000US61747
1,5,85.0,,male,64120,8600000US64120
2,1389,30.0,34.0,male,95117,8600000US95117


In [10]:
# Both columns of the aggregated frame, including zipcode, are int64.
population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33119 entries, 0 to 33118
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   zipcode     33119 non-null  int64
 1   population  33119 non-null  int64
dtypes: int64(2)
memory usage: 517.6 KB


In [11]:
# zipcode values are plain integers here (e.g. 602), not zero-padded strings.
population.head()

Unnamed: 0,zipcode,population
0,602,124560
1,603,164067
2,606,19845
3,610,87048
4,612,201030


In [13]:
# Reduce ZIP+4 strings such as "90002-1635" to their first five characters.
sales_data["zip_code"] = sales_data["zip_code"].str.slice(stop=5)

# Rows without a zip code get a default that depends on the sale date.
zip_cutoff = pd.to_datetime("2018-01-01")
zip_missing = sales_data["zip_code"].isna()
sold_before_cutoff = sales_data["date"] < zip_cutoff
sold_from_cutoff = sales_data["date"] >= zip_cutoff

# NOTE: the fill values are integer literals written into a column of
# strings, so zip_code ends up holding both str and int objects. This is the
# mixed-type situation the notebook goes on to inspect and correct.
sales_data.loc[zip_missing & sold_before_cutoff, "zip_code"] = 90001
sales_data.loc[zip_missing & sold_from_cutoff, "zip_code"] = 90002

In [14]:
# Join sales to census totals on zip code (default inner join). zip_code is
# still a mixed str/int column here, while population.zipcode is int64.
sales_and_population = sales_data.merge(
    population,
    left_on="zip_code",
    right_on="zipcode",
)

In [15]:
# Preview three merged rows. The draw is seeded, as in the earlier
# sales_data sample, so the same rows appear on every run.
sales_and_population.sample(3, random_state=1)

Unnamed: 0,date,units_sold,address_line,city,state,zip_code,zipcode,population
52,2017-02-03 03:33:13,-3,400 W 68TH,CA,USA,90001,90001,171330
34,2019-09-29 23:19:56,7,6122 S MAIN,CA,USA,90002,90002,153669
49,2017-03-17 19:30:36,16,8800 GRAHAM,CA,USA,90001,90001,171330


In [16]:
# Only 73 of the 594 sales rows made it through the merge.
sales_and_population.shape

(73, 8)

In [17]:
# Collect the distinct Python types stored in zip_code to expose the mix.
zip_code_types = {type(zip_value) for zip_value in sales_data["zip_code"]}
print(zip_code_types)

{<class 'int'>, <class 'str'>}


In [18]:
# Cast every zip_code value to int so the column has a single type that
# matches the integer zipcode key on the census side.
zip_code_dtypes = {"zip_code": int}
sales_data = sales_data.astype(zip_code_dtypes)

# Redo the join now that both key columns hold integers.
sales_and_population = sales_data.merge(
    population,
    left_on="zip_code",
    right_on="zipcode",
)

In [19]:
# With zip_code cast to int, all 594 sales rows find a census match.
sales_and_population.shape

(594, 8)