## Dealing with Intervals using Pandas

In [11]:
# Build a numeric interval; with no `closed` argument it is closed on the right, i.e. (0, 5]
iv = pd.Interval(0, 5)
iv

Interval(0, 5, closed='right')

In [12]:
# Timestamps work as interval bounds too; closed='left' keeps the first instant of 2017
# inside the interval and leaves the first instant of 2018 outside it.
start_2017 = pd.Timestamp('2017-01-01 00:00:00')
start_2018 = pd.Timestamp('2018-01-01 00:00:00')
year_2017 = pd.Interval(left=start_2017, right=start_2018, closed='left')
year_2017

Interval('2017-01-01', '2018-01-01', closed='left')

## Pandas

In [13]:
import pandas as pd

# The same four values twice: once with the default 0..3 index, once with string labels.
values = [10, 20, 30, 40]
new_s = pd.Series(values)
put_index = pd.Series(values, index=["a", "b", "c", "d"])

In [14]:
colors = ["red", "orange", "yellow", "white"]
degrees = [30, 40, 50, 60]
numbers = [11, 12, 13, 14]

# Column order is spelled out explicitly; rows are labelled 1-4 instead of the default 0-3.
column_order = ["colors", "degrees", "numbers"]
table = dict(zip(column_order, (colors, degrees, numbers)))
new_df = pd.DataFrame(table, columns=column_order, index=range(1, 5))
new_df

Unnamed: 0,colors,degrees,numbers
1,red,30,11
2,orange,40,12
3,yellow,50,13
4,white,60,14


In [15]:
# Read the sample CSV (path is relative to the working directory) and preview the first six rows.
df_csv = pd.read_csv("input/sample10.csv", low_memory=False)
df_csv.head(6)

Unnamed: 0,No,Cars,numbers,year,price
0,1,Car1,123,2005,5000
1,2,Car2,456,2006,6000
2,3,Car3,789,2007,7000
3,4,Car4,1011,2008,8000
4,5,Car5,1213,2009,9000
5,6,Car6,1415,2010,10000


In [16]:
# NOTE(review): this cell needs network access, and [3] selects the fourth table that
# read_html finds on the page purely by position — confirm it is still the intended table.
new_url = "https://en.wikipedia.org/wiki/Academy_Awards"
# header=0 takes column names from the first table row; index_col=0 uses the first column as the index.
new_html = pd.read_html(new_url, header=0, index_col=0)[3]
new_html.head(5)

Unnamed: 0,Unnamed: 1,Unnamed: 2
Award,Best Actor,Best Actress
Winner,Anthony Hopkins(The Father),Frances McDormand(Nomadland)
,,
Award,Best Supporting Actor,Best Supporting Actress
Winner,Daniel Kaluuya(Judas and the Black Messiah),Youn Yuh-jung(Minari)


In [17]:
# Display the whole frame loaded from the sample CSV.
df_csv

Unnamed: 0,No,Cars,numbers,year,price
0,1,Car1,123,2005,5000
1,2,Car2,456,2006,6000
2,3,Car3,789,2007,7000
3,4,Car4,1011,2008,8000
4,5,Car5,1213,2009,9000
5,6,Car6,1415,2010,10000
6,7,Car7,1617,2011,11000
7,8,Car8,1819,2012,12000
8,9,Car9,2011,2013,13000
9,10,Car10,2356,2014,14000


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_csv.describe()

Unnamed: 0,No,numbers,year,price
count,10.0,10.0,10.0,10.0
mean,5.5,1281.0,2009.5,9500.0
std,3.02765,703.167279,3.02765,3027.650354
min,1.0,123.0,2005.0,5000.0
25%,3.25,844.5,2007.25,7250.0
50%,5.5,1314.0,2009.5,9500.0
75%,7.75,1768.5,2011.75,11750.0
max,10.0,2356.0,2014.0,14000.0


In [19]:
# Car with the highest price. idxmax() returns the index *label* of the maximum, which is
# what .loc expects; argmax() returns a position and only matched here because the frame
# happens to have the default 0..n-1 index.
df_csv.loc[df_csv["price"].idxmax(), ["Cars", "price"]]

Cars     Car10
price    14000
Name: 9, dtype: object

In [20]:
# Car with the lowest price. idxmin() returns the index *label* of the minimum, which is
# what .loc expects; argmin() returns a position and only matched here because the frame
# happens to have the default 0..n-1 index.
df_csv.loc[df_csv["price"].idxmin(), ["Cars", "price"]]

Cars     Car1
price    5000
Name: 0, dtype: object

### Pandas and Matplotlib Exercise

#### Import pandas and read in the purchase data file and set it to a DataFrame called ecom.

In [21]:
import numpy as np
import pandas as pd

In [22]:
# NOTE(review): the saved run of this cell raised FileNotFoundError for 'purchase_data';
# confirm the real file name/extension and location — every `ecom` cell below depends on it.
ecom = pd.read_csv("purchase_data")

FileNotFoundError: [Errno 2] No such file or directory: 'purchase_data'

**Check the head of the DataFrame.**

In [None]:
# Preview the first rows of the purchases frame (head() shows five by default).
ecom.head()

#### How many rows and columns are there?

In [None]:
# info() prints the number of entries (rows) and one line per column, answering both questions.
ecom.info()

####  What is the average Purchase Price?

In [None]:
# Arithmetic mean of the "Purchase Price" column.
ecom["Purchase Price"].mean()

####  What were the highest and lowest purchase prices? 

In [None]:
# Highest value in the "Purchase Price" column.
ecom["Purchase Price"].max()

In [None]:
# Lowest value in the "Purchase Price" column.
ecom["Purchase Price"].min()

####  How many people have English 'en' as their Language of choice on the website?

In [None]:
# Number of rows whose Language is "en": each True in the mask counts as 1, so the sum is
# the answer itself rather than a full .info() report the reader has to pick it out of.
(ecom["Language"] == "en").sum()

####  How many people have the job title of "Lawyer"?


In [None]:
# Number of rows whose Job is exactly "Lawyer": each True in the mask counts as 1, so the
# sum is the answer itself rather than a full .info() report.
(ecom["Job"] == "Lawyer").sum()

#### How many people made the purchase during AM, and how many during PM?

In [None]:
# value_counts() tallies the rows per distinct value of the "AM or PM" column.
ecom["AM or PM"].value_counts()

####  What are the 5 most common Job Titles?

In [None]:
# value_counts() sorts by frequency (most common first), so head() keeps the top five.
ecom["Job"].value_counts().head()

####  Someone made a purchase that came from Lot: "90 WT" , what was the Purchase Price for this transaction?

In [None]:
# Row filter and column pick in a single .loc call: purchase price(s) for lot "90 WT".
ecom.loc[ecom["Lot"] == "90 WT", "Purchase Price"]

####  What is the email of the person with the following Credit Card Number: 4926535242672853

In [None]:
# Row filter and column pick in a single .loc call: email(s) recorded for this card number.
ecom.loc[ecom["Credit Card"] == 4926535242672853, "Email"]

####  How many people have American Express as their Credit Card Provider *and* made a purchase above $95 ?

In [None]:
# True for American Express purchases priced above 95.
amex_over_95 = (ecom['CC Provider'] == "American Express") & (ecom['Purchase Price'] > 95)

# Summing the mask gives one number; .count() on the filtered frame returned a separate
# non-null count for every column instead.
amex_over_95.sum()

####  How many people have a credit card that expires in 2025?

In [None]:
# Same check as before — characters from position 3 onward must equal "25" — but done with
# the vectorized .str accessor instead of a Python-level lambda.
# NOTE(review): assumes expiry strings look like "MM/YY"; confirm against the data.
ecom['CC Exp Date'].str[3:].eq("25").sum()

#### What are the top 5 most popular email providers/hosts (e.g. gmail.com, yahoo.com, etc...)

In [None]:
# The host is the piece after the '@'; split with the vectorized .str accessor, take
# element 1, then keep the five most frequent hosts.
ecom["Email"].str.split('@').str[1].value_counts().head()

#### Find out which company got the maximum purchases using an appropriate graph.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Total purchase value per company.
df = ecom.groupby(["Company"])["Purchase Price"].sum()

# The question asks who purchased the most, so rank by total; head() on the grouped
# result would only give the first five companies in group-key (alphabetical) order.
top_companies = df.nlargest(5)

fig, ax = plt.subplots()
ax.bar(top_companies.index, top_companies.values, color='green', width=0.2)
ax.set_title("Top 5 companies by total purchase price")
ax.set_xlabel("Company")
ax.set_ylabel("Total purchase price")
ax.tick_params(axis="x", labelrotation=10)  # tilt the names so they do not overlap
plt.show()

#### Find out which CC provider had the most purchases using a pie chart.

In [None]:

# Total purchase value per credit-card provider.
df1 = ecom.groupby(["CC Provider"])["Purchase Price"].sum()

fig, ax = plt.subplots()
# pie() scales the wedges to shares of the whole on its own, so the raw totals can be
# passed directly; autopct prints each share with one decimal.
ax.pie(df1, labels=df1.index, autopct='%0.1f')
ax.set_title('CC Providers %')
plt.show()

#### Find the range of purchase price using box plots. And identify outliers if any.

In [None]:
# Pull the column once and reuse it (it was previously assigned but never used).
purchase_price = ecom["Purchase Price"]

fig, ax = plt.subplots()
# With matplotlib's defaults the whiskers reach 1.5 x IQR beyond the quartiles; any
# points drawn outside them are the outliers the exercise asks about.
ax.boxplot([purchase_price], labels=["Purchase Price"])
ax.set_ylabel('Price')
ax.set_title('Analysing Purchase Price')
plt.show()

#### Create two bar graphs side by side: one of total purchase price per CC provider, and one of total purchase price per language.

In [None]:
# One row, two panels: totals per CC provider on the left, totals per language on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

df = ecom.groupby(["CC Provider"])["Purchase Price"].sum()
df1 = ecom.groupby(["Language"])["Purchase Price"].sum()

# Axes 0 — head() keeps the first five groups in group-key order
ax[0].bar(df.head().index, df.head(), color='green', width=0.2)
ax[0].set_title("Total purchase price by CC provider")
ax[0].set_xlabel("CC Provider")
ax[0].set_ylabel("Total purchase price")

# Axes 1 — head() keeps the first five groups in group-key order
ax[1].bar(df1.head().index, df1.head(), color='blue', width=0.2)
ax[1].set_title("Total purchase price by language")
ax[1].set_xlabel("Language")
ax[1].set_ylabel("Total purchase price")

plt.show()

#### Creating Series

In [None]:
# A Series built from a plain list gets the default 0..n-1 integer index.
num = list(range(1, 6))
series = pd.Series(data=num)
series

In [None]:
# A dict turns into a Series whose index is the dict's keys, in insertion order.
student_fields = ('Id', 'Name', 'State', 'Age')
student_values = (100, 'Albert', 'Texas', 24)
student = dict(zip(student_fields, student_values))

res = pd.Series(data=student)
print(res)

In [None]:
# Column name -> list of values; every list has four entries, one per future row.
data = {}
data['mango'] = [2, 4, 5, 1]
data['grapes'] = [1, 4, 8, 3]
data['apples'] = [3, 5, 7, 3]
data

#### Creating DataFrames

In [None]:
# Each key of `data` becomes a column; no index is given, so rows are numbered from 0.
fruits = pd.DataFrame(data)
fruits

In [None]:
# Same frame, but the rows are labelled with names; rebinds `fruits` from the previous cell.
fruits = pd.DataFrame(data, index=['Rakesh', 'Robin', 'Julie', 'Albert'])
fruits

#### Reading from File

##### 1. From csv

In [None]:
# Read the CSV with default settings; the path is relative to the working directory.
df = pd.read_csv('fruits_data.csv')
df

In [None]:
# index_col=0 uses the file's first column as the row index instead of a new 0..n-1 index.
df = pd.read_csv('fruits_data.csv', index_col=0)
df

##### 2. From JSON

In [None]:
# Load the JSON file into a DataFrame; rebinds `df` from the CSV cells above.
df = pd.read_json('fruits_data.json')
df

#### Saving to CSV and JSON

In [None]:
# Write the current `df` to the working directory in both formats.
# NOTE(review): at this point `df` is the frame read from 'fruits_data.json' above, despite
# the 'new_purchases' file names — confirm the intended names.
df.to_csv('new_purchases.csv')

df.to_json('new_purchases.json')

#### DataFrame operations

In [None]:
# Load the movies data and use the "Title" column as the row index.
movies_df = pd.read_csv("movies_data.csv", index_col="Title")

##### Viewing your data

In [None]:
# First five rows (the default for head()).
movies_df.head()

In [None]:
# First ten rows.
movies_df.head(10)

In [None]:
# Last five rows (the default for tail()).
movies_df.tail()

In [None]:
# Last ten rows.
movies_df.tail(10)

#### Getting info about data

In [None]:
# Prints the index range, each column's non-null count and dtype, and memory usage.
movies_df.info()

In [None]:
# (number of rows, number of columns)
movies_df.shape

#### Handling duplicates

In [None]:
# Boolean Series: True for a row that repeats an earlier row in every column.
movies_df.duplicated()

In [None]:
# Number of duplicated rows (each True counts as 1).
movies_df.duplicated().sum()

In [None]:
# Stack the frame on top of itself so every row has an exact duplicate to practise on.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
temp_df = pd.concat([movies_df, movies_df])

temp_df.shape

In [None]:
# Number of rows in temp_df that repeat an earlier row.
temp_df.duplicated().sum()

In [None]:
# drop_duplicates() returns a new frame (by default keeping the first copy of each
# duplicated row), so the result is assigned back; the count is then re-checked.
temp_df = temp_df.drop_duplicates()
temp_df.duplicated().sum()

In [None]:
# In-place variant of the same operation. temp_df was already de-duplicated by the
# previous cell, so running the cells in order leaves it unchanged here.
temp_df.drop_duplicates(inplace=True)

In [None]:
# Rebuild the doubled frame. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in 2.0.
temp_df = pd.concat([movies_df, movies_df])

# keep=False drops *every* copy of a duplicated row rather than keeping one of them.
temp_df.drop_duplicates(inplace=True, keep=False)

temp_df.shape

#### Column Cleanup


In [None]:
# Current column labels.
movies_df.columns

In [None]:
# Rename two columns in place; the mapping is old name -> new name, and columns not
# listed in it keep their names.
movies_df.rename(columns={ 'Runtime (Minutes)': 'Runtime', 'Revenue (Millions)': 'Revenue_millions'}, inplace=True)


movies_df.columns

In [None]:
# Replace all column names at once; the list must supply exactly one name per column.
# NOTE(review): assumes the frame has these 11 columns in this order — confirm.
movies_df.columns = ['rank', 'genre', 'description', 'director', 'actors', 'year', 'runtime', 
                     'rating', 'votes', 'revenue_millions', 'metascore']


movies_df.columns

In [None]:
# Lower-case every column label through the Index's vectorized string accessor.
movies_df.columns = movies_df.columns.str.lower()

movies_df.columns

In [None]:
# Iterating a DataFrame yields its column labels, so this lower-cases each one.
# Identical to the previous cell; lower-casing already lower-case names changes nothing.
movies_df.columns = [col.lower() for col in movies_df]

movies_df.columns

#### Working with missing values

##### There are two options in dealing with nulls:

##### 1. Get rid of rows or columns with nulls
##### 2. Replace nulls with non-null values, a technique known as imputation


In [None]:
# Frame of booleans with the same shape: True wherever a value is missing.
movies_df.isnull()

In [None]:
# Missing-value count per column.
movies_df.isnull().sum()

#####  Removing null values


In [None]:
# Returns (and displays) a copy without the rows that contain a null; movies_df itself
# is not modified because the result is not assigned.
movies_df.dropna()

In [None]:
# Drops every row containing a null from movies_df itself.
# NOTE(review): after this, no nulls remain, so the Imputation cells that follow have
# nothing left to fill when the notebook is run top to bottom — confirm this is intended.
movies_df.dropna(inplace=True)

#### Imputation

In [None]:
# Pull the revenue column out as a Series.
revenue = movies_df['revenue_millions']

In [None]:
# First five revenue values.
revenue.head()

In [None]:
# Mean revenue, used below as the fill value; mean() skips missing values by default.
revenue_mean = revenue.mean()

revenue_mean

In [None]:
# Replace missing revenue with the mean and write the result back to the frame explicitly.
# An in-place fillna on a Series pulled out of the frame is chained assignment: it is not
# guaranteed to update movies_df, and never does once pandas Copy-on-Write is enabled.
movies_df['revenue_millions'] = movies_df['revenue_millions'].fillna(revenue_mean)

# Keep `revenue` pointing at the filled column, as the in-place call used to leave it.
revenue = movies_df['revenue_millions']

In [None]:
# Re-check the per-column missing-value counts after imputation.
movies_df.isnull().sum()

#### Understanding your variables

In [None]:
# Summary statistics for the numeric columns.
movies_df.describe()

In [None]:
# Summary of the 'genre' column on its own.
# NOTE(review): presumably a text column, for which describe() reports count, unique,
# top and freq rather than numeric statistics — confirm the dtype.
movies_df['genre'].describe()

In [None]:
# Ten most frequent genre values (value_counts() sorts by frequency, highest first).
movies_df['genre'].value_counts().head(10)

#### Relationships between continuous variables

In [None]:
# Pairwise correlation of the numeric columns only. Since pandas 2.0, corr() no longer
# drops non-numeric columns silently and raises on text columns such as 'genre', so the
# numeric columns are selected explicitly (this form works on older versions as well).
movies_df.select_dtypes(include="number").corr()

#### DataFrame slicing, selecting, extracting

In [None]:
# Single brackets with one column name return a Series.
genre_col = movies_df['genre']

type(genre_col)

In [None]:
# Double brackets (a list of column names) return a DataFrame, even for a single column.
genre_col = movies_df[['genre']]

type(genre_col)

In [None]:
# Select two columns by name; the result is a DataFrame.
subset = movies_df[['genre', 'rating']]

subset.head()

#### Fetching Data by Rows

##### .loc - locates by label (index name)
##### .iloc - locates by integer position

In [None]:
# Look the row up by its index label; the index is the movie title, set via
# index_col="Title" when the file was read.
prom = movies_df.loc["Prometheus"]

prom

In [None]:
# Look the row up by position; positions start at 0, so this is the second row.
# NOTE(review): presumably meant to be the same "Prometheus" row as above — confirm.
prom = movies_df.iloc[1]
prom

In [None]:
# Label slice: with .loc both end labels are included in the result.
movie_subset = movies_df.loc['Prometheus':'Sing']

# Position slice: with .iloc the stop position is excluded, so this takes rows 1, 2 and 3.
# This assignment overwrites the .loc result above; only this one is displayed.
movie_subset = movies_df.iloc[1:4]

movie_subset

#### Conditional selections

In [None]:
# Comparing a column with a value gives a boolean Series: True where the director matches.
condition = (movies_df['director'] == "Ridley Scott")

condition.head()

In [None]:
# Indexing with the boolean mask keeps only the rows where it is True.
movies_df[movies_df['director'] == "Ridley Scott"]

In [None]:
# First three rows with a rating of at least 8.6.
movies_df[movies_df['rating'] >= 8.6].head(3)

In [None]:
# Combine masks with | (element-wise OR); each comparison needs its own parentheses
# because | binds tighter than ==.
movies_df[(movies_df['director'] == 'Christopher Nolan') | (movies_df['director'] == 'Ridley Scott')].head()

In [None]:
# Same selection as the OR-ed masks, written with isin(): True where the director is in the list.
movies_df[movies_df['director'].isin(['Christopher Nolan', 'Ridley Scott'])].head()

In [None]:
# Build each condition as a named mask, then AND them together.
in_years = movies_df['year'].between(2005, 2010)  # inclusive on both ends, like >= and <=
highly_rated = movies_df['rating'] > 8.0
low_revenue_cutoff = movies_df['revenue_millions'].quantile(0.25)  # bottom-quartile boundary
low_revenue = movies_df['revenue_millions'] < low_revenue_cutoff

movies_df[in_years & highly_rated & low_revenue]

In [None]:
# Summary statistics for the numeric columns; the 25% row for revenue_millions is the
# quantile(0.25) cut-off used in the filter above.
movies_df.describe()

#### Applying functions

In [None]:
def rating_function(x):
    """Label a rating: "good" when it is at least 8.0, "bad" otherwise."""
    return "good" if x >= 8.0 else "bad"

In [None]:
# apply() calls rating_function on each rating; the labels are stored as a new column.
movies_df["rating_category"] = movies_df["rating"].apply(rating_function)
movies_df.head(2)

In [None]:
# Same labelling as rating_function, written inline as a lambda; overwrites the column
# created in the previous cell with identical values.
movies_df["rating_category"] = movies_df["rating"].apply(lambda x: 'good' if x >= 8.0 else 'bad')

movies_df.head(2)

#### Other Useful functions

In [None]:
# Group the rows by genre; .groups maps each genre value to the index labels of its rows.
movie_by_genre=movies_df.groupby(['genre'])
movie_by_genre.groups

In [None]:
# Mean revenue within each genre group.
movie_by_genre['revenue_millions'].mean()

In [None]:
import numpy as np  # no longer used by this cell; import left in place

# Mean revenue per genre as a pivot table (one row per genre). The aggregation is named
# as the string "mean": passing np.mean triggers a FutureWarning in pandas >= 2.1.
movies_pivot_by_genre = pd.pivot_table(
    data=movies_df,
    index='genre',
    values='revenue_millions',
    aggfunc="mean",
)
movies_pivot_by_genre