# Getting Started

In [1]:
import pandas as pd

---

## Pandas Basics

### pd.Series( )

In [None]:
# Sample inputs used by the pandas basics examples.
basic_dataset = [1, 2, 3]
basic_mix_dataset = ["A", "B", "C", 1, 2, 3, {"A1", "B1", "C1"}]
basic_multicol_dataset = {
    "name": ["Allan", "Beatriz", "Clara"],
    "age": [22, 21, 24],
}
basic_dict = {"day_1": 344, "day_2": 423, "day_3": 314}

# pandas.Series(data, index, dtype, name, copy)
#   - A Series is one-dimensional (a single column) and can hold any data type.
#   - When no labels are given, the values are labeled with their position.
#   - Custom labels can be set with pd.Series(data, index=[x, y, z]) and then
#     used to access an item; here the dict keys serve as the labels.
my_series = pd.Series(data=basic_dict)
first_day = my_series["day_1"]

### pd.DataFrame( )

In [None]:
# Sample inputs (repeated here so this cell can run on its own).
basic_dataset = [1, 2, 3]
basic_mix_dataset = ["A", "B", "C", 1, 2, 3, {"A1", "B1", "C1"}]
basic_multicol_dataset = {"name": ["Allan", "Beatriz", "Clara"], "age": [22, 21, 24]}
basic_dict = {"day_1": 344, "day_2": 423, "day_3": 314}


# pandas.DataFrame(data, index, columns, dtype, copy)
#   A DataFrame is two-dimensional (rows, columns).
my_dataframe = pd.DataFrame(basic_multicol_dataset)

# Use .loc to access one or more specific rows by label.
my_dataframe.loc[1]        # one row: returns a pd.Series
my_dataframe.loc[[0, 2]]   # several rows: returns a pd.DataFrame (note the double brackets)

# The "index" argument names the rows.
# Fixed label typo: "Firts" -> "First", so lookups by the intended label work.
my_dataframe = pd.DataFrame(basic_multicol_dataset, index=["First", "Second", "Third"])

### pd.read_csv( )

In [None]:
# pd.read_csv() loads a data set stored as CSV and returns it as a DataFrame.
load_csv = pd.read_csv("data/data.csv") # path to the CSV file

# Printing a DataFrame with many rows directly shows only the first and last 5 rows;
# .to_string() renders the entire DataFrame instead.
#   The display limit can be checked with pd.options.display.max_rows (60 per the original note).
print(load_csv.to_string())

### pd.read_json( )

In [None]:
# pd.read_json() loads a data set stored as JSON.
#   JSON is plain text, but it has the format of an object.
load_json = pd.read_json("data/data.json")

# .to_string() is used to print the entire DataFrame.
print(load_json.to_string())

# JSON has the same structure as a Python dict, so a dict shaped like the
# JSON document can be passed directly to pd.DataFrame().
python_dict = {
    "Duration": {"0": 60, "1": 60, "2": 60, "3": 45, "4": 45, "5": 60},
    "Pulse": {"0": 110, "1": 117, "2": 103, "3": 109, "4": 117, "5": 102},
}

loading_pythondict_directly = pd.DataFrame(data=python_dict)

### Analyzing DataFrames

In [None]:
# .head() gives a quick overview of a DataFrame: it returns the headers and
# the requested number of rows from the top (the top 5 if no number is given).
load_json.head(n=5)

# .tail() shows the last N rows instead.
load_json.tail(n=5)

# .info() prints more information about the data set: how many rows and
# columns it has, the data type of each column, and how many non-null
# values are present.
load_json.info()

## Cleaning Data

### Cleaning Empty Cells

In [None]:
bad_dataset = pd.read_csv("data/bad_data.csv")

# .dropna() removes the rows that contain empty cells.
#   By default it returns a new DataFrame and leaves the original unchanged;
#   pass inplace=True to change the original instead.
bad_dataset.info()  # before .dropna()

bad_dataset.dropna(inplace=True)

bad_dataset.info()  # after .dropna()

# .fillna() replaces NULL values with a specific value instead of dropping rows.
#   NOTE: the rows with empty cells were already dropped above, so at this
#   point there is nothing left to fill; the call is kept to show the syntax.
bad_dataset.fillna(130, inplace=True)

# Typical replacement values: the MEAN, MEDIAN and MODE of a column.
mean = bad_dataset["Calories"].mean()
median = bad_dataset["Calories"].median()
# .mode must be *called* (the original bound the method object itself), and it
# returns a Series because several values can tie; take the first one.
mode = bad_dataset["Calories"].mode()[0]
# Then fill just that column with the chosen value, e.g.:
#   bad_dataset["Calories"] = bad_dataset["Calories"].fillna(mean)

### Cleaning Wrong Format

In [None]:
# The "Date" column has two cells in the wrong format: "NaN" and "20201226".
#   pd.to_datetime() fixes this; if it has problems with differing formats,
#   pass format="mixed".
bad_dataset = pd.read_csv("data/bad_data.csv")

parsed_dates = pd.to_datetime(bad_dataset["Date"], format="mixed")
bad_dataset["Date"] = parsed_dates

print(bad_dataset.to_string())

# The NULL value is still there. To remove rows with an empty cell in one
# specific column, use .dropna(subset=["COLUMN"]).
bad_dataset.dropna(subset=["Date"], inplace=True)

### Cleaning Wrong Data

In [None]:
# In row 7 the duration is 450, but for all the other rows the duration is
# between 30 and 60.

dataframe = pd.read_csv("data/bad_data.csv")

# Replace a single value: .loc[{row label}, "{column}"] = {value}
dataframe.loc[7, "Duration"] = 45

# For big data sets, remove every offending row in one step with a boolean
# mask instead of dropping row by row in a Python loop (each in-place drop
# rebuilds the frame, so the loop scales quadratically).
#   Row 7 was already corrected above, so this only removes any other row
#   whose duration is over 120.
dataframe.drop(dataframe[dataframe["Duration"] > 120].index, inplace=True)

### Removing Duplicates

In [None]:
duplicate_data = pd.read_csv("data/bad_data.csv")

# .duplicated() returns one boolean value per row (True marks a duplicated row).
print(duplicate_data.duplicated())

# .drop_duplicates() removes the duplicated rows; inplace=True changes duplicate_data itself.
duplicate_data.drop_duplicates(inplace=True)

# Printed again after the removal, for comparison with the output above.
print(duplicate_data.duplicated())

## Correlations

### Pandas Correlations

In [None]:
df = pd.read_csv("data/data.csv")

# .corr() calculates the relationship between each pair of columns in the data set.
#   numeric_only=True makes it skip non-numeric columns; since pandas 2.0 the
#   default is False, and a non-numeric column would raise an error instead of
#   being ignored.
df.corr(numeric_only=True)

# 1 means that there is a 1 to 1 relationship (a perfect correlation)

# 0.9 is also a good relationship, and if you increase one value, the other will probably increase as well.

# -0.9 would be just as good relationship as 0.9, but if you increase one value, the other will probably go down.

# 0.2 means NOT a good relationship, meaning that if one value goes up does not mean that the other will.

## Plotting

### Pandas Plotting

In [2]:
# Import matplotlib's pyplot, the library used to visualize the diagrams.
import matplotlib.pyplot as plt
# Standard visualization: load the data set and call .plot() with no arguments.
df = pd.read_csv("data/data.csv")
df.plot()

In [9]:
# Scatter plot
#   Pass kind="scatter" to .plot() and name the columns for the x and y axes.
df.plot(kind="scatter", x="Duration", y="Calories")

# Earlier we looked at the correlation between "Duration" and "Calories";
# looking at the scatter plot, we will agree with it.

In [10]:
# Histogram
#   A histogram needs only one column, so plot directly on that column of df.
df["Duration"].plot(kind="hist", xlabel="Duration")

# This graph tells us that there were over 100 workouts that lasted between
# 50 and 60 minutes.