In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn as sk
import statsmodels.api as sm
import seaborn as sns


# Load Data

In [None]:
from pathlib import Path
import os

# Load the CSV file into a single dataframe.
# '[SUBFOLDER]' and '[FILENAME]' are placeholders to fill in before running.
pwd = os.getcwd()  # current working directory; not used by the lines below
root = Path('input', '[SUBFOLDER]')
file = '[FILENAME].csv'
df = pd.read_csv(root / file)

# Examine the first rows of the dataframe to ensure the data is loaded correctly
df.head()

# Structure Investigation

### Get Number of Rows/Cols

In [None]:
# Report the dataframe dimensions: first the raw (rows, cols) tuple, then each axis labelled.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
print("Rows:", n_rows)
print("Cols:", n_cols)

### Check Datatypes
Get datatypes with non-null count for all columns

In [None]:
# Print the per-column overview: datatype and non-null count for every column.
df.info()

Count how many columns there are of each datatype

In [None]:
# Count how many columns there are of each dtype.
# The Series method is used because the top-level pd.value_counts() is deprecated
# (pandas 2.1+) and emits a FutureWarning.
df.dtypes.value_counts()

### Structure of Numerical Features
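Number of non-null entries (count), mean, standard deviation, minimum, 25th, 50th and 75th percentiles, and maximum value of each numerical column. (The number of unique entries is not reported for numerical columns; it is plotted in the next section.)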

In [None]:
# Summary statistics restricted to the numeric columns.
df.describe(include="number")

### Plot Numerical Features
Plot the number of unique values of each numerical feature on a log-scaled y-axis. Rule of thumb: binary features have 2 unique values, ordinal features have 3-10 unique values, and continuous features have more than 10 unique values.

In [None]:
# Count the distinct values of every numeric column, order the counts ascending,
# and draw them as a bar chart with a log-scaled y axis.
numeric_features = df.select_dtypes(include="number")
unique_values = numeric_features.nunique().sort_values()
unique_values.plot(kind="bar", logy=True)

### Structure of Non-numerical Features
Number of entries, number of unique entries, most common entry, frequency of most common entry

In [None]:
# Summary statistics for every column that is not numeric.
df.describe(exclude="number")

### Structure of All Features

In [None]:
# Summary statistics for all columns, numeric and non-numeric together.
df.describe(include='all')

### Data Selection/Filtering

#### Get Specific Row(s)

In [None]:
# Boolean-mask selection: keep the rows whose 'COLUMN' equals VALUE (both are placeholders).
df.loc[df['COLUMN'] == VALUE]

#### Get Specific Column(s)

In [None]:
# Select several columns at once by passing a list of column names (placeholders here).
df[['COLUMN1','COLUMN2']]

#### Get List of Column Names

In [None]:
# Column names as a plain Python list, cut down to the first x entries (x is a placeholder).
list(df.columns)[0:x]

#### Slicing
Select rows from position `x` up to (but not including) position `y`, taking every `z`-th row; `z` defaults to 1.

In [None]:
# Row slice with start x, stop y and step z (all three are placeholders to fill in).
df[x:y:z]

#### Selecting Data
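- `.loc`: select rows/columns using labels
- `.iloc`: select rows/columns using integer positions

The example below uses `.loc`; `.iloc` is called the same way, with integer positions in place of labels.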

In [None]:
# Rows selected by index label.
df.loc[['RowLabel1', 'RowLabel2']]
# Rows and columns selected by label; as the last expression, this is the result the cell displays.
df.loc[['RowLabel1', 'RowLabel2'],['ColLabel1', 'ColLabel2']]

#### Aggregate Data (Sum/Mean/Mode)
Apply the aggregation `OPERATION` to column `COLUMN2`, separately for each group of rows that share a value in column `COLUMN1`

In [None]:
# Group the rows by 'COLUMN1' and aggregate 'COLUMN2' with the chosen operation (placeholders).
df.groupby(['COLUMN1']).agg({'COLUMN2':'OPERATION'})
# OPERATION can be a name such as sum, max, min or mean; a function can be passed instead

# Data Quality Investigation

### Get Number of Duplicates

In [None]:
# Number of rows flagged by duplicated() (the boolean flags are summed).
df.duplicated().sum()

### Check Duplicates
View duplicate entries as dataframe

In [None]:
# Show the duplicated rows; keep=False flags every occurrence rather than only the repeats.
df[df.duplicated(keep=False)]

### Get Number of Missing Values

In [None]:
# Missing-value count for each column.
df.isna().sum()

In [None]:
# Total number of missing values in the whole dataframe (sum over the underlying boolean array).
df.isna().values.sum()

### Check Missing Values

In [None]:
# Rows that contain at least one missing value (any() is evaluated across the columns of each row).
df[df.isna().any(axis=1)]

### Plot Missing Values

In [None]:
# Draw the missing-value mask as a heatmap without a colour bar, then title the axes.
# "[TITLE]" is a placeholder for the plot title.
ax = sns.heatmap(df.isna(), cbar=False)
ax.set_title("[TITLE]")

Alternative: a plain matplotlib image whose axes show only column numbers and sample (row) numbers; well suited to large datasets

In [None]:
# Render the missing-value mask as an image with the "binary" colour map.
# aspect="auto" and interpolation="nearest" are passed straight to imshow.
plt.imshow(df.isna(), aspect="auto", interpolation="nearest", cmap="binary")

### Remove Missing Values
Remove samples (rows) that are missing more than 20% of their values, i.e. keep rows in which at least 80% of the columns are filled

In [None]:
# Keep only the rows (axis=0) whose non-null count reaches 80% of the column count,
# then renumber the index from 0.
min_non_null_per_row = df.shape[1] * 0.80
df = df.dropna(axis=0, thresh=min_non_null_per_row)
df = df.reset_index(drop=True)

Remove columns that are missing more than 15% of their samples, i.e. keep columns in which at least 85% of the rows are filled

In [None]:
# Keep only the columns (axis=1) whose non-null count reaches 85% of the row count.
min_non_null_per_col = df.shape[0] * 0.85
df = df.dropna(axis=1, thresh=min_non_null_per_col)

# Plotting

### Box and Whisker Plot

In [None]:
# Box-and-whisker plot of one column of df; '[DATA LABEL]' is a placeholder for the column name.
sns.boxplot(x='[DATA LABEL]', data=df)

### Plot Distribution of a Feature

In [None]:
# Count plot of the values in the chosen column (the bracketed name is a placeholder).
# The trailing ';' suppresses the text repr of the returned object in the notebook.
sns.countplot(x=df['[INDEPENDENT VARIABLE]']);

### Plot All Numerical Features
Plot all numerical features to check for binary/ordinal/continuous features. Settings explained:
- `lw=0`: no lines
- `marker="."`: use dot markers for each point
- `subplots=True`: each feature in a separate subplot
- `layout=(-1, 4)`: use as many rows of subplots as needed (-1), with 4 columns of subplots (4)

In [None]:
# One subplot per plotted column, dots only (no connecting lines), four subplots per row.
# [LENGTH] and [WIDTH] are placeholders for the figure size.
df.plot(lw=0, marker=".", subplots=True, layout=(-1, 4), figsize=([LENGTH], [WIDTH]), markersize=1);

### Plot Feature Distributions

In [None]:
# Histograms of the dataframe's columns, four per row, with black bar edges.
# [BINS], [LENGTH] and [WIDTH] are placeholders to fill in.
df.hist(bins=[BINS], layout=(-1, 4),figsize=([LENGTH], [WIDTH]), edgecolor="black")
# Re-pack the subplots in the figure; ';' suppresses the notebook output.
plt.tight_layout();

### Plot Feature Mode Distribution
For identifying features that are dominated by a single value (the bar shows the share of rows equal to the feature's most frequent value)

In [None]:
# For every column, compute the share of rows equal to that column's most frequent value.
# df.mode() returns one row per tied mode (padded with NaN for the other columns), so it
# can have more than one row. Comparing df against that 2-D block either raises a shape
# error or, when the shapes happen to match, compares row-by-row instead of against the
# mode. Use the first mode of each column, as a Series aligned on the column labels.
frequency = df.mode()
most_frequent = frequency.iloc[0]
df_freq = df.eq(most_frequent, axis=1)
# The mean of the boolean matches is the fraction of rows equal to the mode; largest first.
df_freq = df_freq.mean().sort_values(ascending=False)
# [LENGTH] and [WIDTH] are placeholders for the figure size.
df_freq.plot.bar(figsize=([LENGTH], [WIDTH]));

### Plot Numerical Features (Bivariate)

In [None]:
# Pairwise plots of the features in df, coloured by the column passed as hue (placeholder name).
sns.pairplot(df, hue='[INDEPENDENT VARIABLE]');

### Examine Feature Relationships

In [None]:
# Pearson correlation between the numeric (and boolean) features.
# Non-numeric columns are excluded explicitly: from pandas 2.0 on, DataFrame.corr() no
# longer drops them silently and raises on e.g. string columns.
df_corr = df.select_dtypes(include=["number", "bool"]).corr(method="pearson")
# Optional strength labels (S/M/W) that can be supplied via annot=labels together with fmt='':
#labels = np.where(np.abs(df_corr)>0.75, "S", np.where(np.abs(df_corr)>0.5, "M", np.where(np.abs(df_corr)>0.25, "W", "")))
# The identity-matrix mask hides the diagonal (each feature against itself); center=0 puts
# the midpoint of the diverging "vlag" palette at zero correlation.
sns.heatmap(df_corr, mask=np.eye(len(df_corr)), square=True, center=0, annot=False, fmt='g', linewidths=.5, cmap="vlag", cbar_kws={"shrink": 0.8});

### Confusion Matrix
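`confusion_matrix` expects class labels. If `y_true` / `y_pred` hold one-hot vectors or per-class probabilities, convert them to labels first, e.g. `np.argmax(y_pred, axis=-1)`.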

In [None]:
from sklearn.metrics import confusion_matrix
# Tabulate the true labels against the predicted labels.
cm = confusion_matrix(y_true, y_pred)
# Draw the counts as an annotated heatmap on a figure of the requested size
# ([LENGTH] and [WIDTH] are placeholders).
fig, ax = plt.subplots(figsize=([LENGTH], [WIDTH]))
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual');

# NLP

### Create Bag of Words

In [None]:
import nltk
from collections import Counter

# Fetch every NLTK resource used below, in one place.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases are reported to load the tokenizer data from
# 'punkt_tab' instead of 'punkt' - confirm against the installed version.
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

text =  """MULTILINE
          STRING
          BLOCK
        """

# Split the text into sentences, then tokenize EVERY sentence. Previously the loop result
# was thrown away and only the last sentence (the leaked loop variable) was tokenized.
sentences = nltk.sent_tokenize(text)
words = [word for sentence in sentences for word in nltk.word_tokenize(sentence)]

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()  # alternative normaliser; not used below

# The stop-word list is lower-case, so compare lower-cased tokens ("The" is a stop word too).
stop_words = set(stopwords.words("english"))
words_cleaned = [word for word in words if word.lower() not in stop_words]

# Bag of words: occurrence count of each lower-cased, lemmatized token.
bag_of_words = Counter(lemmatizer.lemmatize(word.lower()) for word in words_cleaned)
bag_of_words.most_common()

### Break sentences into words

In [None]:
# Tokenize each sentence into words and print the tokens prefixed with the sentence index.
for i, sentence in enumerate(sentences):
    words = nltk.word_tokenize(sentence)
    print(f'{i}: {words}')

# Others

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for validation, stratified on the target.
# [X] and [y] are placeholders for the feature matrix and the target.
# random_state is fixed so that re-running the cell reproduces the same split.
X_train, X_val, y_train, y_val = train_test_split([X], [y], test_size=0.1, stratify=[y], random_state=42)

### Random Forest Importance Scores

In [None]:
from sklearn.ensemble import RandomForestClassifier
featLabels = []  # fill in: names of the feature columns to score
x_train = df[featLabels] # features/independent variables
y_train = df['[INDEPENDENT VARIABLE]']  # placeholder for the target column the forest predicts
# random_state makes the importance scores reproducible between runs;
# n_jobs=-1 trains the 10000 trees on all available cores instead of one.
clf = RandomForestClassifier(n_estimators=10000, random_state=42, n_jobs=-1)
clf.fit(x_train, y_train)
# Print one (feature name, importance) pair per feature, in featLabels order.
for feature in zip(featLabels, clf.feature_importances_):
    print(feature)

Alternative method: list only the features whose importance score reaches a given threshold

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
featLabels = []  # fill in: names of the feature columns to score
x_train = df[featLabels] # features/independent variables
y_train = df['[INDEPENDENT VARIABLE]']  # placeholder for the target column the forest predicts
# random_state makes the selection reproducible between runs;
# n_jobs=-1 trains the 10000 trees on all available cores instead of one.
clf = RandomForestClassifier(n_estimators=10000, random_state=42, n_jobs=-1)
clf.fit(x_train, y_train)
# prefit=True reuses the forest fitted above. Calling featSel.fit() on a non-prefit
# selector would train a second copy of the 10000-tree forest.
featSel = SelectFromModel(clf, threshold=0.15, prefit=True)
# Print the names of the features whose importance is at least the threshold.
for featIndex in featSel.get_support(indices=True):
    print(featLabels[featIndex])