In [None]:
import numpy as np
import pandas as pd

### Creating a pandas dataframe

In [None]:
# Load the house-prices training set from a relative path.
house_prices_df = pd.read_csv('../data/house-prices/train.csv')

In [None]:
# Load the Melbourne housing snapshot from a relative path.
melb_data = pd.read_csv('../data/melbourne_housing_snapshot/melb_data.csv')

Taking a look at the dataframes.

In [None]:
# Preview the first rows of the house-prices frame.
house_prices_df.head()

In [None]:
# Preview the first rows of the Melbourne frame.
melb_data.head()

In [None]:
# Data type of every column.
house_prices_df.dtypes

### Describing the dataframes

In [None]:
# Summary statistics for the frame's columns.
house_prices_df.describe()

### Describe a particular column

In [None]:
# Summary statistics for a single column.
house_prices_df['MSSubClass'].describe()

### Get the dataframe shape and columns

In [None]:
# (number of rows, number of columns)
house_prices_df.shape

In [None]:
# Index object holding the column labels.
house_prices_df.columns

### Changing the column names of the dataframes

In [None]:
# A single row of mixed-type values; no column names are given, so pandas
# assigns the default integer labels.
d = [[1, "a", "foo", 3.0]]
df = pd.DataFrame(d)

In [None]:
# Show the frame with its default integer column labels.
df

In [None]:
# Assigning to `.columns` relabels the columns in place; the list supplies
# one name per existing column.
df.columns = ['id', 'x1', 'x2', 'x3']

In [None]:
# Same data, now shown with the new column labels.
df

In [None]:
# Map of existing column names to their replacements. `rename` silently
# ignores keys that are not present among the columns.
# NOTE(review): confirm the CSV really has a lowercase "id" column; if the
# column is already named "Id", the first entry is a no-op.
columns_map = {"id": "Id", "SalePrice": "SalePriceDollars"}

# Rename columns only and return a new frame (house_prices_df is untouched).
# The previous `index=str` argument additionally mapped every row label
# through `str`, turning the integer row index into strings -- unrelated to
# renaming columns and enough to break label lookups such as `.loc[0]`.
changed_cols_df = house_prices_df.rename(columns=columns_map)

In [None]:
# Preview the frame returned by rename.
changed_cols_df.head()

### Unique values in a column

In [None]:
# Distinct values found in the column.
house_prices_df['MSSubClass'].unique()

In [None]:
# Number of distinct values; dropna=False keeps a missing value in the count,
# matching the length of `.unique()`.
house_prices_df['MSSubClass'].nunique(dropna=False)

### Value Counts

In [None]:
# Number of rows for each distinct value of the column.
house_prices_df["MSSubClass"].value_counts()

### Order by and group by

In [None]:
# Group the rows by MSSubClass. This only builds the GroupBy object; the
# aggregation itself is applied in the next cell.
df_agg = house_prices_df.groupby(['MSSubClass'])

In [None]:
# Rows per group, largest group first.
df_agg.size().sort_values(ascending=False)

### Filtering data

In [None]:
# Boolean mask selecting the rows whose MSSubClass equals 20, then a preview.
house_prices_df.loc[house_prices_df["MSSubClass"] == 20].head()

### Membership in dataframe

In [None]:
# Boolean mask: True where MSSubClass is one of the listed values.
# The column is treated as numeric elsewhere in this notebook (it is compared
# against the integer 20 in the filtering example), so the candidates must be
# integers; the strings "20"/"60" never match and give an all-False mask.
house_prices_df["MSSubClass"].isin([20, 60])

In [None]:
# Keep only the rows whose MSSubClass is 20 or 60. Integer candidates are
# used because the column is compared against the integer 20 elsewhere in
# this notebook; string candidates ("20", "60") match nothing and would
# leave this frame empty.
validMembership = house_prices_df[house_prices_df["MSSubClass"].isin([20, 60])]

In [None]:
# (rows, columns) of the filtered frame.
validMembership.shape

### Concatenating arrays with NumPy

In [None]:
# The integers 0..11 laid out as 3 rows by 4 columns.
arr = np.arange(12).reshape(3, 4)

In [None]:
# Show the array.
arr

In [None]:
# Join the array with itself along axis 1, i.e. side by side for a 2-D array.
np.concatenate([arr, arr], axis=1)

### Handling missing data.

For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an array are numerical, and that all have and hold meaning. A basic strategy to use incomplete datasets is to discard entire rows and/or columns containing missing values. However, this comes at the price of losing data which may be valuable (even though incomplete). A better strategy is to impute the missing values, i.e., to infer them from the known part of the data.

In [None]:
# Preview the frame again before looking at missing values.
house_prices_df.head()

Many will recognise the `fillna` method, which replaces missing values with a given value.

In [None]:
# Derive a new column from YearBuilt with its missing values replaced by 1964.
# NOTE(review): 1964 is a hard-coded fill value; its origin is not shown in
# this notebook -- confirm it is the intended constant.
melb_data['YearBuilt_new'] = melb_data['YearBuilt'].fillna(1964)
melb_data.head()

Another way is to use the scikit-learn imputer. The advantage of the imputer class is that it learns the fill values with `fit` and applies them with `transform`, so you can plug it into a scikit-learn pipeline and reuse the same imputation on new data, which is convenient when working with a large matrix.

In [None]:
# Narrow the frame down to the two columns used in the imputation example.
# Note that this rebinds `melb_data`, so the other columns are no longer
# reachable through this name.
features_in_focus = ['BuildingArea', 'YearBuilt']
melb_data = melb_data.loc[:, features_in_focus]

In [None]:
# Preview the reduced frame.
melb_data.head()

In [None]:
from sklearn.impute import SimpleImputer

# Imputer that treats NaN as "missing" and fills it with the column mean.
my_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Learn the per-column means and apply them in one step.
imputed_melb_data = my_imputer.fit_transform(melb_data)

In [None]:
# Inspect the first element (row) of the imputer output.
imputed_melb_data[0]

### Discretization and Binning

In [None]:
# Values to bucket and the edges that delimit the buckets.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# right=False closes each interval on the left and opens it on the right:
# [18, 25), [25, 35), [35, 60), [60, 100).
cats = pd.cut(x=ages, bins=bins, right=False)

# Integer position of the bin each age fell into.
cats.codes

### Getting a particular value

In [None]:
# Label-based lookup of a single cell: row label 0, column "MSSubClass".
house_prices_df.loc[0, "MSSubClass"]

In [None]:
# Alternative form: select the column first, then index the resulting
# Series with 0.
house_prices_df["MSSubClass"][0]

### Sorting

In [None]:
# Rows ordered by MSSubClass, highest first; only the top of the result is shown.
house_prices_df.sort_values(by="MSSubClass", ascending=False).head()

### Reshaping and Pivoting

In [None]:
# One row per product: name, category, quarter and profit.
data = [
    ["memories", "book", "q1", 10],
    ["dreams", "book", "q2", 20],
    ["reflections", "book", "q3", 30],
    ["how to build a house", "book", "q4", 40],
    ["wonderful life", "music", "q1", 10],
    ["million miles", "music", "q2", 20],
    ["run away", "music", "q3", 30],
    ["mind and body", "music", "q4", 40],
]

# Name the columns at construction time.
df_products = pd.DataFrame(
    data,
    columns=["product", "category", "quarter", "profit"],
)
df_products.head()

In [None]:
# Total profit per category (rows) and quarter (columns).
# The aggregation is given by name: passing the NumPy callable `np.sum` is
# deprecated in recent pandas releases and emits a FutureWarning, while the
# string "sum" yields the same totals.
df_products.pivot_table(index="category",
                        columns=["quarter"],
                        values=["profit"],
                        aggfunc="sum")

### Merges and Joins

In [None]:
# Left frame: one row per person with a date and a duration.
llist = pd.DataFrame(
    [["bob", "2015-01-13", 4],
     ["alice", "2015-04-23", 10]],
    columns=["name", "date", "duration"],
)

# Right frame: one row per person with an upload count.
right = pd.DataFrame(
    [["alice", 100],
     ["bob", 23]],
    columns=["name", "upload"],
)

In [None]:
# Join the two frames on their shared "name" column (merge defaults to an
# inner join). The key is listed once; repeating it as ['name', 'name'] was
# redundant.
# Note: this rebinds `df`, which earlier held the small column-renaming example.
df = pd.merge(llist, right, on="name")
df.head()

In [None]:
# Stack the rows of `right` underneath `llist`. `DataFrame.append` was removed
# in pandas 2.0; `pd.concat` is the supported equivalent. `sort=True` orders
# the union of the columns alphabetically, and columns missing from one frame
# are filled with NaN.
pd.concat([llist, right], sort=True)

### Working on time series data

In [None]:
# Number of rows to look ahead when building the "future" column.
FUTURE_PERIOD_PREDICT = 3

# `names=` supplies the column labels explicitly.
crypto_df = pd.read_csv(
    "../data/crypto_data/LTC-USD.csv",
    names=["time", "low", "high", "open", "close", "volume"],
)

# A negative shift pulls values upward: each row receives the close found
# FUTURE_PERIOD_PREDICT rows later, and the final rows, which have no such
# value, become NaN.
crypto_df["future"] = crypto_df["close"].shift(periods=-FUTURE_PERIOD_PREDICT)

crypto_df.head()

In [None]:
# The last rows, where the shifted "future" column has no value to pull from.
crypto_df.tail()

### Function application, transformations and mapping

In [None]:
def classify(current, future):
    """Return 1 when ``future`` is strictly greater than ``current``, else 0.

    Both arguments are converted with ``float`` before being compared, so a
    NaN on either side compares as not-greater and yields 0.
    """
    return int(float(future) > float(current))


# Label every row by comparing its close with the close further ahead.
crypto_df["target"] = crypto_df[["close", "future"]].apply(
    lambda row: classify(row["close"], row["future"]),
    axis=1,
)

crypto_df.head()

In [None]:
# Cast the column to float with the vectorised `astype` instead of calling
# Python's `float` on every element through `apply`.
house_prices_df["SalePrice"] = house_prices_df["SalePrice"].astype(float)

In [None]:
# Preview the frame after the SalePrice conversion.
house_prices_df.head()

### How to do machine learning. An example using sklearn.

In [None]:
# Hourly bike-sharing records, loaded from a relative path.
bike_sharing_df = pd.read_csv("../data/bike-sharing/hour.csv")

# Predictor columns fed to the model, and the column it should predict.
featureCols = [
    "season", "yr", "mnth", "hr",
    "holiday", "weekday", "workingday",
    "weathersit", "temp", "atemp",
    "hum", "windspeed",
]
output_col = "cnt"

# Split the frame into the feature matrix and the target vector.
bike_sharing_df_X = bike_sharing_df.loc[:, featureCols]
bike_sharing_df_y = bike_sharing_df.loc[:, output_col]

In [None]:
# Preview the first rows of the bike-sharing frame.
bike_sharing_df.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    bike_sharing_df_X,
    bike_sharing_df_y,
    test_size=0.33,
    random_state=42,
)

# Fit an ordinary least-squares model on the training portion.
lm = LinearRegression()
model = lm.fit(X_train, y_train)