# Hogwarts School of Data Wizardry
## Session 4: Intro to Data Science Methods!
`Hufflepuff`  |   [GitHub](https://github.tesla.com/EHSS/Hogwarts) | [Documentation](https://confluence.teslamotors.com/display/EHSSST/Hogwarts+School+of+Data+Wizardry)

#################################################################################################################################################
#################################################################################################################################################

## Table of Contents
### [1. Natural Language Processing (NLP)](#1-natural-language-processing)
- [1.1 Text Cleaning](#11-text-cleaning)
- [1.2 Making a Dictionary](#12-creating-a-dictionary-ie-list-of-all-words)
### [2. Logistic Regression Examples](#2-logistic-regression)
- [2.1 Bag of Words](#21-bag-of-words)
### [3. k Nearest Neighbors](#3-knn----k-nearest-neighbors)
### [4. BERT](#4-bert)

#################################################################################################################################################
#################################################################################################################################################

In [1]:
import pandas as pd
import numpy as np
import math
import sklearn as sk

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [3]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text
import pickle
import warnings
warnings.filterwarnings("ignore")

## 1. Natural Language Processing

### 1.1 Text Cleaning

Remove contractions, turn all text to lowercase, and create a vocab list (i.e. dictionary) of all words in the text column of interest.

In [4]:
# Lookup table mapping English contractions to their expanded forms.
# Ambiguous contractions list every candidate expansion in a single string,
# separated by " / " (e.g. "he'd" -> "he had / he would"); the table does not
# pick one, so whatever consumes it receives the whole string.
# NOTE(review): the keys "I'd", "I'd've", "I'll", "I'll've", "I'm" and "I've"
# are capitalised while every other key is lowercase, so a lookup on
# lowercased tokens only reaches them if it is case-insensitive.
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [5]:
def clean_text(text, remove_stopwords = True):
    '''
    Lowercase the text, expand contractions, and strip URLs, HTML remnants and punctuation.

    @PARAM
    text == the raw string to clean
    remove_stopwords == accepted for backward compatibility only; it currently has
                        no effect (no stop-word filtering is performed)

    @RETURN
    the cleaned, lower-cased string. Removed characters are replaced by spaces,
    so the result can contain runs of consecutive spaces.
    '''
    # Case-insensitive view of the module-level `contractions` table. The text is
    # lower-cased before the lookup, so keys such as "I'm" would otherwise never
    # match; the expansions are lower-cased too so the output stays lowercase.
    expansions = {key.lower(): value.lower() for key, value in contractions.items()}

    # Convert to lower case and replace contractions with their longer forms.
    # Tokens are matched whole, so a contraction with punctuation attached
    # (e.g. "don't,") is left as-is here and only has its punctuation stripped below.
    words = text.lower().split()
    text = " ".join(expansions.get(word, word) for word in words)

    # Remove URLs. Only the URL token itself is dropped: after the join above the
    # text is a single line, so a greedy `.*` would erase everything after the first URL.
    text = re.sub(r'https?://\S+', '', text)

    # Remove HTML remnants. This has to run before the punctuation pass, which
    # rewrites '/', '=' and '&' and would stop these patterns from matching.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)

    # Replace punctuation and apostrophes with spaces
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    return text

### 1.2 Creating a dictionary, i.e. list of all words

Next we build a dictionary (vocabulary) of every word that appears in our dataset. We take the series `text_column`, iterate through every list within the series, and then through every word within each list.

Each word is added to the dictionary only if it is not already there, so every word appears exactly once. This process repeats for every row.

Once the dictionary is built, we remove any empty strings and sort it alphabetically. The dictionary is now ready to be used for future natural language processing.

In [13]:
def dictionary_maker(text_column):
    '''
    Build the sorted vocabulary of every distinct word in a column of tokenized text.

    @PARAM
    text_column == iterable of rows, where each row is itself an iterable of words
                   (e.g. a pandas Series of token lists). Words must be hashable
                   and mutually sortable, which holds for strings.

    @RETURN
    alphabetically sorted list of the unique words, with the empty string excluded.
    The size of the list is also printed.
    '''
    # A set removes duplicates in one pass; checking membership against a growing
    # list would make the build quadratic in the vocabulary size.
    unique_words = set()
    for row in text_column:
        unique_words.update(row)

    #remove empty strings
    unique_words.discard("")

    #sort dictionary
    dictionary = sorted(unique_words)

    print(f"The dictionary contains {len(dictionary)} words")
    return dictionary

##############################################################################################################################################

## 2. Logistic Regression

Specify the test size as a decimal (percentage) for the train test split as the `test_size` parameter.

In [9]:
# This creates a train-validation split on our labeled data
from sklearn.model_selection import train_test_split

# FILL ME IN before running this cell:
#   df        -> the labeled dataset to split (passed as `dataframe` below)
#   test_frac -> the decimal fraction of the data to hold out for validation
#                (passed as `test_size` below)
# The empty strings are placeholders only.
df = ""
test_frac = ""

def train_test_sets(dataframe, test_size):
    '''
    Split labeled data into a training set and a validation set.

    @PARAM
    dataframe == the labeled data to split
    test_size == size of the validation set, forwarded to train_test_split
                 (a decimal fraction such as 0.2)

    @RETURN
    (train, val) == the two splits; their sizes are also printed
    '''
    # test_size has to be passed by keyword: every positional argument of
    # train_test_split is treated as another array to split.
    # random_state is fixed so the split is reproducible between runs.
    train, val = train_test_split(dataframe, test_size=test_size, random_state=42)
    print("The training set is " + str(len(train)) + " data points")
    print("The validation set is " + str(len(val)) + " data points")
    return train, val

# Build the training and validation sets from the values filled in above
train, val = train_test_sets(dataframe=df, test_size=test_frac)

#### 2.1 Bag of Words

In [10]:
def bow(text_column, train_set, test_set, ngram_range=(3, 3)):
    '''
    Turn a text column into n-gram count features for the training and validation sets.

    @PARAM
    text_column == string of the column in the dataframe where the text for analysis is
    train_set == set of divided training data
    test_set == set of divided validation data
    ngram_range == (min_n, max_n) sizes of the n-grams to count. The default (3, 3)
                   counts trigrams only; use (1, 1) for a classic bag of words.

    @RETURN
    (X_tr_bow, X_te_bow) == count matrices for the training and validation sets,
                            both built from the vocabulary learned on the training set
    '''
    # The identity tokenizer hands every entry of the column to the vectorizer
    # unchanged, so each entry is presumably an already-tokenized list of words
    # (TODO confirm against the caller). lowercase=False leaves the tokens untouched.
    # ngram_range is coerced to a tuple, the type scikit-learn documents for it.
    bow_transform = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=tuple(ngram_range), lowercase=False)

    # Learn the vocabulary on the training data only, then reuse it for the validation data
    X_tr_bow = bow_transform.fit_transform(train_set[text_column])
    X_te_bow = bow_transform.transform(test_set[text_column])

    return X_tr_bow, X_te_bow

##################################################################################################################################################

## 3. kNN -- K-Nearest Neighbors

kNN (k-Nearest Neighbors) is an algorithm that classifies a data point by finding the k labeled data points closest to it, where closeness is measured with a norm.

In this case, the "norm" is the distance between any 2 data points, often computed as the Euclidean distance:
Distance between $ (x_1, y_1) $ and $ (x_2, y_2) = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2} $

### @PARAM

k is an odd integer:
$ k = 2n + 1 \ \ \ \ \ \text{for some} \ \  n \in \mathbb{N} $


#### Q: Why must k be an odd integer?

--

### @RETURN

The classification = mode of the labels of the k nearest neighbors.

e.g. Imagine k = 9 for a given data point, which we are trying to classify as red or blue. Suppose 5 neighbors are red and 4 neighbors are blue. The mode (most frequently occurring value) is red, so the item is classified as red.

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
def knn(dataframe, text_col, class_col, k, test_split_size):
    '''
    Fit a k-nearest-neighbors classifier on a label-encoded text column and print its accuracy.

    @PARAM
    dataframe == the dataframe of relevance
    text_col == the name of the column that contains text for analysis
    class_col == the name of the column that contains the classification of each row
    k == the number of neighbors (MUST BE ODD)
    test_split_size == fraction of data split for testing set (float)

    @RETURN
    None -- the predictions, the actual labels and the accuracy are printed
    '''
    # Columns are looked up by the names passed in; attribute access
    # (dataframe.text_col) would look for a column literally called "text_col".
    encoder = preprocessing.LabelEncoder()
    # NOTE(review): LabelEncoder gives every distinct text value an integer id, so
    # the single feature used below is that id -- confirm this is the intended
    # representation of the text.
    text_encoded = encoder.fit_transform(dataframe[text_col])

    # reshape(-1, 1) turns the 1-D array of ids into a single-feature 2-D array
    X_train, X_test, y_train, y_test = train_test_split(
        text_encoded.reshape(-1, 1), dataframe[class_col], test_size=test_split_size)
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print('Predicted', y_pred)
    print('Actual data', y_test)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy', accuracy)

##################################################################################################################################################

## 4. BERT
### Bidirectional Encoder Representations from Transformers


BERT was developed by Google for predicting words in sentences. The algorithm has a fairly high accuracy rate. It is not difficult to use once it is understood.

def `transformer`: the attention mechanism that learns contextual relationships between words in a text.

-- 

#### Training Strategies
1. Masked LM
    - adds a classification layer on top of the encoder output
    - process includes multiplying the output layer by an embedding matrix
    - words are transformed to the vocab dimension
2. Next Sentence Prediction
    - labels 2 given sentences as `IsNext` or `NotNext`
    - 50% of sentences are a pair of subsequent sentences
    - 50% of sentences are a random corpus sentence pair

--

#### DistilBERT
DistilBERT is a distilled ("lite") version of the BERT algorithm, produced through knowledge distillation.

Essentially, it is a compression technique that transfers knowledge from a large model to a smaller model with similar performance.

In [None]:
import tensorflow
import ktrain
from ktrain import text
from tensorflow import keras
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Select the `transformers` model class, tokenizer class and checkpoint name.
# For DistilBERT:
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load the pretrained tokenizer first, then the pretrained model, both from the same checkpoint name
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [14]:
def bert(text_col, classification_col, test_set_size, classification_names, learning_rate=None):
    '''
    Fine-tune a ktrain BERT text classifier and return a predictor for it.

    Typical use is two calls: the first without learning_rate, to display the
    learning-rate finder plot; the second with the learning rate read off that plot.

    @PARAM
    assuming
    text_col == the dataframe column of text (strings)
    classification_col == the dataframe column of (numeric) classifications, otherwise change .to_numpy() to .tolist()
    test_set_size == how large the test set should be
    classification_names == list of classifications (i.e. ['red', 'blue'])
    learning_rate == learning rate passed to autofit. When omitted (None), the
                     learning-rate finder is run and plotted and a ValueError is raised.

    @RETURN
    the ktrain predictor built from the trained model and its preprocessor

    @RAISES
    ValueError if learning_rate is None (raised after the finder plot is shown)
    '''
    # Stratified, reproducible train/validation split
    x_train, x_val, y_train, y_val = train_test_split(
        text_col.tolist(), classification_col.to_numpy(),
        shuffle=True, test_size=test_set_size, random_state=12342, stratify=classification_col)

    # BERT-specific preprocessing. The class names come from the caller instead of
    # a hard-coded ["0", "1"], so they agree with the names used in validate() below.
    # NOTE(review): assumes classification_names is ordered to match the numeric
    # labels -- confirm against the caller.
    (x_train_bert, y_train_bert), (x_val_bert, y_val_bert), preprocessor = text.texts_from_array(
        x_train=x_train, y_train=y_train, x_test=x_val, y_test=y_val,
        class_names=list(classification_names),
        preprocess_mode="bert",
        lang="en",
        maxlen=65,
        max_features=35000)

    classifier = text.text_classifier('bert', train_data=(x_train_bert, y_train_bert), preproc=preprocessor)
    learner = ktrain.get_learner(classifier, train_data=(x_train_bert, y_train_bert),
                                 val_data=(x_val_bert, y_val_bert), batch_size=16)

    if learning_rate is None:
        #finding the best learning rate via plotting
        learner.lr_find()
        learner.lr_plot()
        #TODO: Automate finding the min
        raise ValueError("No learning rate given: pick one from the learning-rate plot "
                         "and call bert(...) again with learning_rate=<value>")

    learner.autofit(learning_rate)

    #validate the data
    learner.validate(val_data=(x_val_bert, y_val_bert), class_names=classification_names)

    predictor = ktrain.get_predictor(learner.model, preprocessor)
    learner.print_layers()

    return predictor
