<h1 style="color:blue"> Data Wrangling </h1>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
%matplotlib inline

<h2 style="color:red"> Reading Data from PDF Files </h2>

In [None]:
import PyPDF2

In [None]:
# Text accumulator that the PDF-reading cell appends to; it starts as a single space.
content = " "

In [None]:
# Append the extracted text of every page of RIL.pdf to `content`.
# PdfFileReader / numPages / getPage / extractText were deprecated in PyPDF2 2.x
# and raise errors in PyPDF2 3.0, so the PdfReader API is used instead.
with open("RIL.pdf", "rb") as f:
    pdfreader = PyPDF2.PdfReader(f)
    pages = len(pdfreader.pages)
    # Join all page texts once rather than re-concatenating inside a loop.
    # The file must still be open here because pages are read lazily.
    content = content + "".join(
        page.extract_text() for page in pdfreader.pages
    )


In [None]:
# Show the accumulated PDF text as plain text.
print(content)

<h2 style="color:red"> Reading Datetime Objects </h2>

In [None]:
from datetime import datetime

# A date written as day-month-year, parsed with the matching format string.
x = "30-06-2020"
x_date = datetime.strptime(x, "%d-%m-%Y")

In [None]:
# Render the parsed date as "<full month name> <4-digit year>".
x_date.strftime("%B %Y")

<h2 style="color:red"> Checking Datatype Errors </h2>

In [None]:
# Load the toy dataset whose columns are cleaned in the cells below.
df = pd.read_csv("Toy Data.csv")

In [None]:
# Preview the first rows to see how the raw values are formatted.
df.head()

In [None]:
# Inspect column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Preview the Revenue conversion without modifying `df`.
# regex=False makes "Rs." a literal match: pandas versions before 2.0 default
# to regex=True, where the "." would match any character.
(
    df["Revenue"]
    .str.replace("Rs.", "", regex=False)
    .str.replace(" crores", "", regex=False)
    .astype("float64")
    .head(2)
)

In [None]:
# Same Revenue preview as the previous cell; `df` is still unchanged here.
# regex=False keeps "Rs." a literal match on every pandas version (before 2.0
# the default regex=True lets "." match any character).
(
    df["Revenue"]
    .str.replace("Rs.", "", regex=False)
    .str.replace(" crores", "", regex=False)
    .astype("float64")
    .head(2)
)

In [None]:
# Strip the "Rs." prefix and " crores" suffix, then store Revenue as float64.
# regex=False treats both patterns as literal text: pandas versions before 2.0
# default to regex=True, where the "." in "Rs." matches any character.
# NOTE: this cell is not re-runnable, because after the first run Revenue is
# numeric and no longer has a .str accessor.
df["Revenue"] = (
    df["Revenue"]
    .str.replace("Rs.", "", regex=False)
    .str.replace(" crores", "", regex=False)
    .astype("float64")
)

In [None]:
# Drop the percent sign, convert to float64 and rescale from percent to a fraction.
roe_without_sign = df["ROE"].str.replace("%", "")
df["ROE"] = roe_without_sign.astype("float64") / 100
del roe_without_sign

In [None]:
# Check the first converted ROE values.
df.ROE.head(3)

In [None]:
# Remove the " crores" suffix and store Profit as float64, then preview it.
df["Profit"] = (
    df["Profit"]
    .str.replace(" crores", "")
    .astype("float64")
)
df["Profit"].head(2)

In [None]:
# Re-check the dtypes after the Revenue, ROE and Profit conversions.
df.info()

In [None]:
# Parse the Date column using a day-month-year format with no separators,
# then preview the result.
# NOTE(review): presumably the CSV stores dates as ddmmyyyy digits; confirm against the file.
df["Date"] = pd.to_datetime(df["Date"], format="%d%m%Y")
df["Date"].head(3)

## Exercise: Clean the Future 500.csv Dataset

<h2 style="color:red"> Clean Variable Names </h2>

In [None]:
import janitor

In [None]:
# Load the dataset used for the column-name cleaning demo (rebinds `df`).
df = pd.read_csv("janitor.csv")

In [None]:
# Display the frame with its original column names.
df

In [None]:
# Normalise the column names with pyjanitor's clean_names() (available on
# DataFrames once `janitor` is imported) and display the result.
df = df.clean_names()
df

In [None]:
# Rename the "margin%" column to "margin_percent" with pyjanitor's rename_column,
# then display the frame.
# NOTE(review): pandas' own df.rename(columns={...}) looks like a drop-in
# alternative here; confirm before switching, as error behaviour may differ.
df=df.rename_column("margin%","margin_percent")
df

<h2 style="color:red"> Managing Missing Data </h2>

In [None]:
pip install missingno

In [None]:
import missingno as msno

In [None]:
# Load the dataset used for the missing-data demo (rebinds `df`).
df = pd.read_csv("missing toy data.csv")

In [None]:
# Nullity matrix: shows where values are missing, row by row, in each column.
msno.matrix(df) 

In [None]:
# Bar chart of how complete each column is.
msno.bar(df) 

In [None]:
# Heatmap of how missingness in one column correlates with missingness in another.
msno.heatmap(df) 

In [None]:
# Listwise deletion: keep only the rows that have no missing value.
# The result gets its own name instead of using inplace=True, so `df` still
# contains its missing values for the mean-imputation cell further down, and
# re-running this cell always gives the same result.
df_complete_rows = df.dropna(axis=0)
df_complete_rows

In [None]:
# Display the current state of `df`.
df

In [None]:
# Mean imputation: replace missing stock prices with the column mean.
# The result is assigned back to the column. Calling fillna(inplace=True) on a
# column selection is chained assignment: newer pandas versions warn about it,
# and it leaves `df` unchanged when Copy-on-Write is enabled.
df['Stock Price'] = df['Stock Price'].fillna(df['Stock Price'].mean())
df

In [None]:
import datetime

In [None]:
# Five consecutive calendar days starting on 1 January 2021.
dates = pd.date_range(
    start=datetime.date(2021, 1, 1),
    periods=5,
    freq="D",
)
# Price series with one missing observation, on the third day.
prices = [100, 102, np.nan, 103, 102]

In [None]:
# Build a single-column price frame indexed by the dates above, then show it.
df = pd.DataFrame({"Prices": prices}, index=dates)
df

In [None]:
# Forward fill: carry the last observed price forward into the gap.
# .ffill() replaces fillna(method='ffill'), which is deprecated in recent
# pandas. The result is assigned back because inplace=True on a column
# selection is chained assignment and leaves `df` unchanged under Copy-on-Write.
df["Prices"] = df["Prices"].ffill()
df

<h2 style="color:red"> Wide and Long Form Data </h2>

In [None]:
# Panel layout: each of the five companies appears five times in a row,
# and the five-year sequence is repeated once per company.
companies = np.repeat(list("ABCDE"), 5)
years = list(range(2019, 2024)) * 5

In [None]:
# Fix the seed so the simulated figures are reproducible.
np.random.seed(13)
# 25 integer asset values drawn from [100, 200).
assets = np.random.randint(100, 200, 25)
# Profit is assets scaled by one random percentage: randint is called without
# `size`, so the same factor applies to every row.
profit = (assets * np.random.randint(1, 100)) / 100

In [None]:
# Assemble the long-form panel with one row per (company, year).
# Building from a dict keeps each column's own dtype. Stacking the arrays in a
# list and transposing, as before, coerces every column to `object`, so assets
# and profit stop being numeric.
df = pd.DataFrame(
    {
        "companies": companies,
        "years": years,
        "assets": assets,
        "profit": profit,
    }
)

In [None]:
# Peek at the first two and last two rows of the long-form frame.
df.iloc[[0,1,23,24],:]

In [None]:
# Long -> wide: one row per company and one column per (variable, year) pair.
# No `values` argument is given, so all remaining columns are pivoted.
df_wide = df.pivot(index='companies',columns='years')
df_wide

In [None]:
# Wide -> long: stack the year level of the columns back into rows, flatten the
# index into ordinary columns, and preview the first rows.
df_wide.stack().reset_index().head()