# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [5]:
# Initial imports
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [27]:
stocks = pd.read_csv("../Resources/stock_data.csv")

### Identify the number of rows and columns (shape) in the DataFrame.

In [28]:
stocks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   symbol              504 non-null    object 
 1   name                502 non-null    object 
 2   sector              501 non-null    object 
 3   price               500 non-null    object 
 4   price_per_earnings  497 non-null    float64
 5   dividend_yield      499 non-null    float64
 6   earnings_per_share  498 non-null    object 
 7   52_week_low         500 non-null    float64
 8   52_week_high        500 non-null    float64
 9   market_cap          500 non-null    float64
 10  ebitda              492 non-null    float64
 11  price_per_sales     500 non-null    float64
 12  price_per_book      492 non-null    float64
 13  sec_filings         500 non-null    object 
dtypes: float64(8), object(6)
memory usage: 55.2+ KB


### Preview the DataFrame using `head` to visually ensure data has been loaded in correctly.

In [29]:
stocks.head()

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.49,138721100000.0,9048000000.0,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
1,AOS,A.O. Smith Corp,Industrials,,,,,,,,,,,
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121000000.0,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.49956,3.29,125.86,60.05,181386300000.0,10310000000.0,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
4,ATVI,Activision Blizzard,Information Technology,65.83,,0.431903,1.28,74.945,38.93,52518670000.0,2704000000.0,10.59512,5.16,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [52]:
records = stocks.shape[0]
ogrecords = len(stocks.index)
ogrecords

504

### Identify null records

In [31]:
stocks.isnull()

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,True,True,True,True,True,True,True,True,True,True
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,False,False,False,False,False,False,False,False,False,False,False,False,False,False
500,False,False,False,False,False,False,False,False,False,False,False,False,False,False
501,False,False,False,False,False,False,False,False,False,False,False,False,False,False
502,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Drop Null Records

In [32]:
stocks_nona = stocks.dropna().copy()

### Validate nulls have been dropped

In [40]:
stocks_nona.isnull().sum()
# stocks.isnull().sum()

symbol                0
name                  0
sector                0
price                 0
price_per_earnings    0
dividend_yield        0
earnings_per_share    0
52_week_low           0
52_week_high          0
market_cap            0
ebitda                0
price_per_sales       0
price_per_book        0
sec_filings           0
dtype: int64

### Default null `ebitda` values to 0. Then, validate no records are null for `ebitda`.

In [65]:
stocks_ebitda = stocks.copy()
stocks_ebitda['ebitda'] = stocks['ebitda'].fillna(0)
# stocks_ebitda
stocks_ebitda.isnull().sum()

symbol                 0
name                   2
sector                 3
price                  4
price_per_earnings     7
dividend_yield         5
earnings_per_share     6
52_week_low            4
52_week_high           4
market_cap             4
ebitda                 0
price_per_sales        4
price_per_book        12
sec_filings            4
dtype: int64

### Drop Duplicates

In [69]:
stocks
stocks['name'].duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
499    False
500    False
501    False
502    False
503    False
Name: name, Length: 504, dtype: bool

---

### Challenge

#### Preview price field using the head function.

#### Clean `price` Series by replacing `$`

#### Confirm data type of `price`

#### Cast `price` Series as float and then validate using `dtype`