In [2]:
# Download the NLTK data packages named below: the stop-word list, the punkt
# tokenizer models, two part-of-speech taggers and WordNet.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_treebank_pos_tagger')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# NLP
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# Regression Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.metrics import AUC, BinaryAccuracy
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Testing and optimization
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
# Use the public sklearn.metrics namespace: the private sklearn.metrics.regression
# module was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# import module
from src.pipeline import *

# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   
import pickle
import datetime as dt
import glob

# Define IAM role
# get_execution_role is imported from the sagemaker SDK earlier in this cell.
role = get_execution_role()
# NOTE(review): presumably the S3 key prefix for SageMaker artifacts; it is not
# referenced by any other visible cell — confirm it is still needed.
prefix = 'sagemaker/DEMO-xgboost-dm'
# Maps an AWS region name to the XGBoost container image URI for that region.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'} # each region has its XGBoost container
my_region = boto3.session.Session().region_name # set the region of the instance
# The containers[my_region] lookup raises KeyError when the session's region is
# not one of the four keys listed above.
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")


Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [10]:
!pip install wrapt --upgrade --ignore-installed
!pip install tensorflow

Collecting wrapt
Installing collected packages: wrapt
Successfully installed wrapt-1.12.0
[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting tensorflow
  Using cached https://files.pythonhosted.org/packages/de/f0/96fb2e0412ae9692dbf400e5b04432885f677ad6241c088ccc5fe7724d69/tensorflow-1.14.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting absl-py>=0.7.0 (from tensorflow)
Collecting tensorboard<1.15.0,>=1.14.0 (from tensorflow)
  Using cached https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl
Collecting google-pasta>=0.1.6 (from tensorflow)
  Using cached https://files.pythonhosted.org/packages/c3/fd/1e86bc4837cc9a3a5faf3db9b1854aa04ad35b5f381f9648fbe81a6f94e4/google_pasta-0.1.8-py3-none-any.whl
Collecting astor>=0.6.0 (from tensorflow)
  Using cached https://files.pythonhosted.org/packag

In [11]:
pip install --upgrade pip


The following command must be run outside of the IPython shell:

    $ pip install --upgrade pip

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [2]:
# Fetch the pickled DataFrame from S3 and deserialize it into `df`.
s3 = boto3.resource('s3')
bucket = 'fakenewscorpus'
key = 'data/2M_df.pkl'
# Download the whole object body into memory as bytes.
pickled_bytes = s3.Bucket(bucket).Object(key).get()['Body'].read()
# NOTE(review): pickle.loads can execute arbitrary code embedded in the payload;
# this is only safe while the bucket contents are fully trusted.
df = pickle.loads(pickled_bytes)

In [3]:
def balance(df):
    """Count the rows of each binary class in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column.

    Returns
    -------
    tuple of (int, int)
        ``(n_pos, n_neg)``: the number of rows whose label equals 1 and the
        number whose label equals 0. Rows carrying any other label value are
        counted in neither.
    """
    labels = df['label']
    # Summing a boolean mask counts its True entries; int() keeps plain Python ints.
    n_pos = int((labels == 1).sum())
    n_neg = int((labels == 0).sum())
    return n_pos, n_neg


In [4]:
# Count label==1 and label==0 rows across the whole loaded frame.
n_pos, n_neg = balance(df)

In [5]:
# Fraction of all rows in df whose label is 1.
n_pos/len(df)

0.33273061724603664

In [6]:
# Read the 'term' column of the CSV into a NumPy array.
# NOTE(review): presumably a custom stop-word list (it is passed to tokenize below) — confirm.
sw = pd.read_csv('data/sw1k.csv')['term'].to_numpy()

In [7]:
# Draw a 10,000-row random subset of df to work with. random_state fixes the
# draw so the sample — and everything derived from it downstream — is the same
# on every run.
sample = df.sample(10000, axis=0, random_state=42)

In [8]:
# (label==1 count, label==0 count) within the sample.
balance(sample)

(3226, 6774)

In [9]:
# Tokenize the sampled 'content' column, passing `sw` as the second argument.
# NOTE(review): tokenize is not defined in this notebook; presumably it comes
# from the `from src.pipeline import *` star import — confirm.
tokens = tokenize(sample['content'],sw)

In [10]:
# Vectorize the tokens with at most 5000 features and ngram=1.
# The second result was previously bound to `tf`, which overwrote the
# `import tensorflow as tf` module alias when cells run top to bottom; it now
# uses a distinct name.
# NOTE(review): vectorize presumably comes from src.pipeline; the meaning of the
# five results (bag-of-words, term-frequency, TF-IDF, and two fitted
# vectorizers?) is inferred from their names only — confirm.
bow, term_freq, tfidf, cv, tv = vectorize(tokens,max_features=5000,ngram=1)

In [15]:
# Persist the TF-IDF features of the sample as CSV (the DataFrame index is written too).
pd.DataFrame(tfidf).to_csv('data/sample_tfidf.csv')

In [16]:
# Model inputs: the TF-IDF result itself (same object, no copy is made).
X_train = tfidf

In [17]:
# Model targets: the 'label' column of the sampled rows.
y_train = sample['label']

In [15]:
# Number of columns in the feature matrix. Every row of X_train has the same
# width, so the value is read straight from the shape instead of scanning all
# rows with max(..., key=len).
# The (misspelled) name is kept because the padding cell below refers to it.
MAX_SEQ_LENGHT = X_train.shape[1]
MAX_SEQ_LENGHT

5000

In [None]:
# NOTE(review): neither `vectorizer` nor `X_train_sequences` is assigned in any
# visible cell of this notebook, so unless `from src.pipeline import *` provides
# them this cell raises NameError. The vectorize(...) call above returns `cv`
# and `tv`, presumably the fitted vectorizers — confirm which one is intended.
N_FEATURES = len(vectorizer.get_feature_names())
# Pad/truncate every sequence to MAX_SEQ_LENGHT entries, using N_FEATURES as the fill value.
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)

In [23]:
# Binary classifier: Embedding -> LSTM(256) -> single sigmoid output unit.
# NOTE(review): X_train is assigned from the TF-IDF result, while an Embedding
# layer looks rows up by integer index; feeding it fractional TF-IDF weights is
# probably not the intended input representation — confirm.
model = Sequential()
model.add(Embedding(X_train.shape[1] + 1,  # input_dim: number of feature columns + 1
                    256  # Embedding size
                   ))
model.add(LSTM(256))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC','BinaryAccuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [24]:
# Create the output directory; -p makes this a no-op when it already exists.
!mkdir -p saved_model
# NOTE(review): in cell order this save happens before model.fit, so the file
# written here holds the compiled but not yet fitted model — confirm that is intended.
model.save('saved_model/sample_model')

In [None]:
# Train on everything except the last 2000 rows and validate on those last
# 2000 rows (a positional hold-out; no shuffling is done here).
train_rows = slice(None, -2000)
val_rows = slice(-2000, None)
model.fit(
    X_train[train_rows], y_train[train_rows],
    epochs=3, batch_size=512, verbose=1,
    validation_data=(X_train[val_rows], y_train[val_rows]),
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/3


In [1]:
# Inspect the repository state before touching the last commit.
!git status

On branch master
Your branch is ahead of 'origin/master' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   .ipynb_checkpoints/Untitled-Copy1-checkpoint.ipynb[m
	[31mmodified:   .ipynb_checkpoints/Untitled-checkpoint.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [2]:
# Move the branch back by one commit while keeping that commit's changes staged.
# NOTE(review): not idempotent — every re-run of this cell (e.g. "Run All")
# rewinds one more commit; consider removing it from the notebook.
!git reset HEAD^ --soft

In [3]:
# Check what the soft reset left staged.
!git status

On branch master
Your branch is up-to-date with 'origin/master'.

Changes to be committed:
  (use "git reset HEAD <file>..." to unstage)

	[32mnew file:   .ipynb_checkpoints/Untitled-Copy1-checkpoint.ipynb[m
	[32mmodified:   .ipynb_checkpoints/Untitled-checkpoint.ipynb[m
	[32mnew file:   Untitled-Copy1.ipynb[m
	[32mmodified:   Untitled.ipynb[m
	[32mnew file:   data/sample_tfidf.csv[m
	[32mnew file:   saved_model/sample_model[m

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   .ipynb_checkpoints/Untitled-Copy1-checkpoint.ipynb[m
	[31mmodified:   .ipynb_checkpoints/Untitled-checkpoint.ipynb[m
	[31mmodified:   Untitled.ipynb[m



In [4]:
# Unstage everything (mixed reset to HEAD); files in the working tree are left untouched.
!git reset

Unstaged changes after reset:
M	.ipynb_checkpoints/Untitled-checkpoint.ipynb
M	Untitled.ipynb


In [5]:
# Confirm nothing is staged any more and see which files are now untracked.
!git status

On branch master
Your branch is up-to-date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   .ipynb_checkpoints/Untitled-checkpoint.ipynb[m
	[31mmodified:   Untitled.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31m.ipynb_checkpoints/Untitled-Copy1-checkpoint.ipynb[m
	[31mUntitled-Copy1.ipynb[m
	[31mdata/sample_tfidf.csv[m
	[31msaved_model/[m

no changes added to commit (use "git add" and/or "git commit -a")
