## Introduction to Pandas Notebook
1. Install pandas in your Python environment (e.g. `pip install pandas`).
2. Import it with `import pandas as pd`.

In [5]:
# import pandas
import pandas as pd

In [2]:
# pandas has two main data types: Series (1-D) and DataFrame (2-D).
# Start with a Series of car makes.
series = pd.Series(data=["BMW", "Toyota", "Honda"])

In [3]:
# Display the Series: index labels on the left, values on the right
series

0       BMW
1    Toyota
2     Honda
dtype: object

In [4]:
# A Series is 1-dimensional: a single column of values

In [5]:
# A second Series, this time holding one colour per car
colors = pd.Series(data=["Red", "Blue", "White"])
colors

0      Red
1     Blue
2    White
dtype: object

In [6]:
# A DataFrame is 2-dimensional: each Series becomes one named column
car_data = pd.DataFrame(
    data={
        "Make": series,
        "Color": colors,
    }
)
car_data

Unnamed: 0,Make,Color
0,BMW,Red
1,Toyota,Blue
2,Honda,White


In [6]:
# Load car-sales.csv into a pandas DataFrame
car_sales = pd.read_csv(filepath_or_buffer="car-sales.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [8]:
# Write the DataFrame out as a CSV file (default settings, so the index is written too)
car_sales.to_csv(path_or_buf="exported-car_sales.csv")

In [9]:
# Load the exported CSV back into a DataFrame.
# Compare it with car_sales: is it the same, what is different, and why?
exported_car_sales = pd.read_csv(filepath_or_buffer="exported-car_sales.csv")
exported_car_sales

Unnamed: 0.1,Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,0,Toyota,White,150043,4,"$4,000.00"
1,1,Honda,Red,87899,4,"$5,000.00"
2,2,Toyota,Blue,32549,3,"$7,000.00"
3,3,BMW,Black,11179,5,"$22,000.00"
4,4,Nissan,White,213095,4,"$3,500.00"
5,5,Toyota,Green,99213,4,"$4,500.00"
6,6,Honda,Blue,45698,4,"$7,500.00"
7,7,Honda,Blue,54738,4,"$7,000.00"
8,8,Toyota,White,60000,4,"$6,250.00"
9,9,Nissan,White,31600,4,"$9,700.00"


In [16]:
# Why the extra "Unnamed: 0" column appeared: to_csv() wrote the DataFrame's index as an
# unnamed first column, and a plain read_csv() then loaded that column as ordinary data.
# Fix on read: re-read the *exported* file and tell pandas that its first column is the
# index (index_col=0). Note that index_col=False would NOT help here: it only stops pandas
# from using the first column as the index, so the column would still show up as data.
# Alternative fix on write: export with car_sales.to_csv("exported-car_sales.csv", index=False).
exported_car_sales_fixed = pd.read_csv("exported-car_sales.csv", index_col=0)
exported_car_sales_fixed

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [11]:
# pandas can read a CSV straight from a URL.
# NOTE: the address below is only a placeholder; replace it with a real, reachable CSV URL,
# otherwise this cell fails with a URLError.
csv_file_to_import = pd.read_csv(
    "https://urlpath.to.file/file.csv"
)

# Note: when pulling from GitHub, link to the file in its raw format.

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

In [17]:
# Column-wise mean (average) of the numeric columns only; string-based columns are skipped
car_sales.mean(axis=0, numeric_only=True)

Odometer (KM)    78601.4
Doors                4.0
dtype: float64

## Describe (Exploring) Data

In [14]:
# Attribute: accessed without parentheses; shows the data type of each column
car_sales.dtypes

# Function (method): called with parentheses; left commented out as an illustration only
#car_sales.to_csv()

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [18]:
# Get the column names and store them in car_columns
car_columns = car_sales.columns
car_columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [19]:
# Get the row index of car_sales
car_sales.index

RangeIndex(start=0, stop=10, step=1)

In [20]:
# Summary statistics for car_sales (by default only numeric columns are described)
car_sales.describe()

Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [21]:
# Print a concise summary of car_sales: index range, columns, non-null counts, dtypes, memory usage
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 532.0+ bytes


In [23]:
# Column-wise mean of car_sales, restricted to its numeric columns
car_sales.mean(axis=0, numeric_only=True)

Odometer (KM)    78601.4
Doors                4.0
dtype: float64

In [24]:
# Mean on a Series.
# A Series holds a single dtype, so there are no columns to filter: numeric_only is a
# DataFrame-level option (older pandas versions raise NotImplementedError when it is passed
# to Series.mean), so it is simply omitted here.
car_prices = pd.Series([30000, 89000, 110000])
car_prices.mean()

76333.33333333333

In [29]:
# Sum of a single DataFrame column: selecting "Doors" and adding up its values
car_sales["Doors"].sum()

40

In [7]:
# Number of rows in the DataFrame (the length of its index)
len(car_sales.index)

10

## Viewing and Selecting Data