In [2]:
import boto3
import io
bucket = 'textgenerationbucket'
key = 'text_data/Reviews.csv'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=key)

In [3]:
import pandas as pd
import os
import re
import numpy as np
import torch

In [4]:
df = pd.read_csv(io.BytesIO(obj['Body'].read()))
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
 # Information About Data
    
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
# Droping the Unwanted Columns:

# data = df.drop(["Id", "ProductId", "UserId", "ProfileName", "HelpfulnessNumerator", "HelpfulnessDenominator", "Score", "Time", "Summary"], axis=1)
data = df[['Text']]


In [7]:
# Making all the words to lower case:

data["Text"] = [re.sub("[^a-z' ]", "", i.lower()) for i in data["Text"]]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0,Text
0,i have bought several of the vitality canned d...
1,product arrived labeled as jumbo salted peanut...
2,this is a confection that has been around a fe...
3,if you are looking for the secret ingredient i...
4,great taffy at a great price there was a wide...
...,...
568449,great for sesame chickenthis is a good if not ...
568450,i'm disappointed with the flavor the chocolate...
568451,these stars are small so you can give of thos...
568452,these are the best treats for training and rew...


In [8]:
# Printing a sample:

data["Text"][0]

'i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than  most'

In [9]:
# Function to create a sequence of length 10 Tokens:
def create_seq(text, seq_len = 10):
    
    sequences = []
    
    #if the number of tokens in text is greater than 5
    if len(text.split()) > seq_len:
        for i in range(seq_len, len(text.split())):
            # Select sequence of tokens
            seq = text.split()[i-seq_len:i+1]
            #add to the list
            sequences.append(" ".join(seq))
        return sequences
    else:
        return[text]

In [10]:
sentence ="i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meatand it smells better my labrador is finicky and she appreciates this product better than most."

In [11]:
create_seq(sentence)

['i have bought several of the vitality canned dog food products',
 'have bought several of the vitality canned dog food products and',
 'bought several of the vitality canned dog food products and have',
 'several of the vitality canned dog food products and have found',
 'of the vitality canned dog food products and have found them',
 'the vitality canned dog food products and have found them all',
 'vitality canned dog food products and have found them all to',
 'canned dog food products and have found them all to be',
 'dog food products and have found them all to be of',
 'food products and have found them all to be of good',
 'products and have found them all to be of good quality',
 'and have found them all to be of good quality the',
 'have found them all to be of good quality the product',
 'found them all to be of good quality the product looks',
 'them all to be of good quality the product looks more',
 'all to be of good quality the product looks more like',
 'to be of good

In [12]:

# Creating a list of text:

seq = []
text = data["Text"].values
for i in range(10000):
    seqi = create_seq(text[i])
    seq.extend([s for s in seqi if len(s.split(" ")) == 11])

In [13]:
# Length of the List:

len(seq)

652591

In [14]:
# Printing the Last 10 TEXT in the List:

for i in range(652581,652591):
    print(seq[i])

one of the more expensive places target has the best price
of the more expensive places target has the best price so
the more expensive places target has the best price so for
more expensive places target has the best price so for now
expensive places target has the best price so for now it
places target has the best price so for now it works
target has the best price so for now it works and
has the best price so for now it works and i
the best price so for now it works and i recommend
best price so for now it works and i recommend it


In [15]:
# create inputs and targets (x and y)
x = []
y = []

for s in seq:
      if len(s.split()) == 11:
        x.append(" ".join(s.split()[:-1]))
        y.append(" ".join(s.split()[1:]))

In [16]:
# Printing Last 5 Texts of  x:

for i in range(652581,652591):
    print(x[i])

one of the more expensive places target has the best
of the more expensive places target has the best price
the more expensive places target has the best price so
more expensive places target has the best price so for
expensive places target has the best price so for now
places target has the best price so for now it
target has the best price so for now it works
has the best price so for now it works and
the best price so for now it works and i
best price so for now it works and i recommend


In [17]:
#Printing Last 5 Texts of y:

for i in range(652581,652591):
    print(y[i])

of the more expensive places target has the best price
the more expensive places target has the best price so
more expensive places target has the best price so for
expensive places target has the best price so for now
places target has the best price so for now it
target has the best price so for now it works
has the best price so for now it works and
the best price so for now it works and i
best price so for now it works and i recommend
price so for now it works and i recommend it


In [18]:
# create integer-to-token mapping
int2token = {}
cnt = 0

for w in set(" ".join(seq).split()):
    int2token[cnt] = w
    cnt+= 1

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

In [19]:
#Creating 2 dictionary that maps token

print(token2int["the"]) # Token-to-Integer

print(int2token[7171])  # Integer-to-Token

19194
youbr


In [20]:
# Storing the dictionarys to S3 in .json format

import json 
dict1 = token2int
dict2 = int2token
s3 = boto3.resource('s3') 
obj1 = s3.Object('textgenerationbucket','inputs/token2int.json')
obj = s3.Object('textgenerationbucket','inputs/int2token.json') 
obj1.put(Body=json.dumps(dict1))
obj.put(Body=json.dumps(dict2))

{'ResponseMetadata': {'RequestId': '6HDNPZJZTD0JB767',
  'HostId': '6UKoRVKBVG+UGzzoN5DpNIS7GVb9Gn+XxxvSSvTF46ESAzdxYTUqowHwm3Vs84LSStBWI/GTv8I=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '6UKoRVKBVG+UGzzoN5DpNIS7GVb9Gn+XxxvSSvTF46ESAzdxYTUqowHwm3Vs84LSStBWI/GTv8I=',
   'x-amz-request-id': '6HDNPZJZTD0JB767',
   'date': 'Sun, 27 Mar 2022 14:47:47 GMT',
   'x-amz-version-id': 'd2dWAed0LA8Rx31WGt5nVPsxrkeKIf5W',
   'etag': '"3b5074540fc3705c69991b5b30a65109"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"3b5074540fc3705c69991b5b30a65109"',
 'VersionId': 'd2dWAed0LA8Rx31WGt5nVPsxrkeKIf5W'}

In [21]:
# set vocabulary size
vocab_size = len(int2token)
vocab_size

24301

In [22]:
# Defining function to convert text sequences to integer sequences:

def get_integer_seq(seq):
    return [token2int[w] for w in seq.split()]

In [23]:
# converting text sequences to integer sequences:

x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

In [24]:
np.array(x_int).shape

(652591, 10)

In [25]:
# x_int = torch.tensor(np.array(x_int))
# y_int = torch.tensor(np.array(y_int))

In [26]:
upload_dir = 'inputs/'
if not os.path.exists(upload_dir): # Make sure that the folder exists
    os.makedirs(upload_dir)

np.save(os.path.join(upload_dir, 'y_int.npy'), y_int)
np.save(os.path.join(upload_dir, 'x_int.npy'), x_int)

In [27]:
import sagemaker
sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'torch_data'

input_data = sagemaker_session.upload_data(path=upload_dir, bucket='textgenerationbucket', key_prefix=prefix)

In [28]:
import boto3
s3client = boto3.client("s3")
s3 = boto3.resource('s3')
s3 = boto3.resource(
    service_name='s3',
    region_name='us-east-1',
    aws_access_key_id='AKIASCVPXXOPVBCKOLUF',
    aws_secret_access_key='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'
)

In [29]:
import sagemaker

print(sagemaker.__version__)
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
print(role)

2.77.1
arn:aws:iam::143176219551:role/service-role/AmazonSageMaker-ExecutionRole-20211115T211961


In [None]:
!pygmentize textgeneration.py

In [None]:
from sagemaker import get_execution_role, Session
role=get_execution_role()
role

In [None]:
from sagemaker.pytorch import PyTorch


pytorch_estimator = PyTorch(entry_point='textgeneration.py',
                            role = role,
                            instance_type='',
                            instance_count=1,
                            framework_version='1.6.0',
                            py_version='py36',
                            script_mode=True
                           )

In [None]:
pytorch_estimator.fit()