**Downloading necessary packages and importing necessary libraries**

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!pip install -q simpletransformers
import warnings, re
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch import maximum
from simpletransformers.language_modeling import LanguageModelingModel,LanguageModelingArgs
from simpletransformers.language_generation import LanguageGenerationModel, LanguageGenerationArgs

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26

**Reading dataset**

In [2]:
df = pd.read_csv ('/content/drive/MyDrive/next-word-prediction/dataset.csv')
df.head(5)

Unnamed: 0,text
0,Photo by Josh Riemer on Unsplash\n\nMerry Chri...
1,Your Brain On Coronavirus\n\nA guide to the cu...
2,Mind Your Nose\n\nHow smell training can chang...
3,Passionate about the synergy between science a...
4,"You’ve heard of him, haven’t you? Phineas Gage..."


**Text Cleaning Of The Known Mistakes**

In [3]:
df['text'] = df['text'].str.replace('\n\n', ' ')
df['text'] = df['text'].str.replace('  ', ' ')
df['text'] = df['text'].str.replace('\.\)', '.')
df['text'] = df['text'].str.replace('—', ',')
df['text'] = df['text'].str.replace(' ,', ',')

**Thorough Text Cleaning**

In [4]:
# Function to clean text
def clean_text(text):
    # Remove heading
    cleaned_text = re.sub(r'^[^\n]+\n\n', '', text)
    # Remove web links
    cleaned_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www\.[a-zA-Z0-9]+\.[a-zA-Z0-9]+(?:[\S]+)?', '', cleaned_text)
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)
    # Filter out sentences with less than 10 words and not containing 'photo by'
    filtered_sentences = [sentence for sentence in sentences if len(re.findall(r'\b\w+\b', sentence)) >= 10 and 'photo by' not in sentence.lower()]
    # Join filtered sentences into a single paragraph
    cleaned_text = ' '.join(filtered_sentences)
    # Remove non-alphabetic characters except necessary punctuations
    cleaned_text = re.sub(r"[^a-zA-Z\s.,'!?]", '', cleaned_text)
    return cleaned_text

# Apply the function to the 'text' column
df['text'] = df['text'].apply(clean_text)

# Remove rows with less than 100 words
df = df[df['text'].apply(lambda x: len(x.split())) >= 100]

# Print the DataFrame with cleaned text
print(df)

                                                   text
0     We just wanted everyone to know how much we ap...
1     It frustrates politicians and public health of...
2     Mind Your Nose How smell training can change y...
4     The railroad worker who survived an explosion ...
5     Mentally, Young Adults Are Suffering Most From...
...                                                 ...
7996  Writing Advice For The Noobs, All of Us Starte...
7998  Characteristics and features of GoogLeNet conf...
7999  Indeed, there is a lot of potential in Machine...
8000  Some of the questions we are hoping to answer ...
8001  Resilience When we heard the word resilience d...

[7597 rows x 1 columns]


**Looking for and dropping duplicates**

In [5]:
print (df.shape)
df = df.drop_duplicates()
print (df.shape)

(7597, 1)
(7596, 1)


**1 duplicate found**

**Resetting index**

In [6]:
df = df.reset_index(drop=True)

**Looking for missing values**

In [7]:
df.isnull().sum()

text    0
dtype: int64

**No missing values found**

**Extracting first 100 words from each row/entry, removing all characters that come after the last full stop and saving the modified dataframe**

In [8]:
text_column = "text"

def extract_first_100_words(text):
    words = text.split()[:100]
    return " ".join(words)

df[text_column] = df[text_column].apply(extract_first_100_words)

text_column = "text"

def remove_words_after_full_stop(text):
    last_full_stop_index = text.rfind(".")
    if last_full_stop_index != -1:
        text = text[:last_full_stop_index + 1]
    return text

df[text_column] = df[text_column].apply(remove_words_after_full_stop)

# Saving the modified dataframe to a new CSV file
df.to_csv("/content/drive/MyDrive/next-word-prediction/next_word_prediction_dataset.csv", index=False)

**Reading the modified dataframe**

In [9]:
df = pd.read_csv ('/content/drive/MyDrive/next-word-prediction/next_word_prediction_dataset.csv')
df.head(3)

Unnamed: 0,text
0,We just wanted everyone to know how much we ap...
1,It frustrates politicians and public health of...
2,Mind Your Nose How smell training can change y...


**Making train, test and validation splits and checking their shapes**

In [10]:
train_old, test = train_test_split (df, test_size = 0.2, random_state =1)
train, val = train_test_split (train_old, test_size = 0.2, random_state =1)
print (train.shape, test.shape, val.shape)

(4860, 1) (1520, 1) (1216, 1)


**Converting train, test and validation sets to text format as required by the model and saving them in the drive**

In [11]:
path = r'/content/drive/MyDrive/next-word-prediction/next_word_prediction_train_set.txt'

with open(path, 'a') as f:
    train_string = train.to_string(header=False, index=False)
    f.write(train_string)

path = r'/content/drive/MyDrive/next-word-prediction/next_word_prediction_test_set.txt'

with open(path, 'a') as f:
    test_string = test.to_string(header=False, index=False)
    f.write(test_string)

path = r'/content/drive/MyDrive/next-word-prediction/next_word_prediction_val_set.txt'

with open(path, 'a') as f:
    validation_string = val.to_string(header=False, index=False)
    f.write(validation_string)

**Editing model configurations according to our requirement**

In [12]:
model_args = LanguageModelingArgs()

model_args.max_seq_length = 100
model_args.truncation = True
model_args.num_train_epochs = 4
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.output_dir = "/content/drive/MyDrive/next-word-prediction/model/"
model_args.best_model_dir = "/content/drive/MyDrive/next-word-prediction/model/"
model_args.save_best_model =True
model_args.dataset_type = "simple"
model_args.mlm = False
model_args.vocab_size = 50257
model_args.train_batch_size = 15
model_args.learning_rate = 5e-5
model_args.gradient_accumulation_steps = 8
model_args.weight_decay = 0.01
model_args.max_length = 10
model_args.do_sample = True
model_args.temperature = 1.0
model_args.top_k = 50
model_args.top_p = 0.9
model_args.repetition_penalty = 1.2
model_args.length_penalty = 1.2
model_args.num_beams = 5
model_args.no_repeat_ngram_size = 2
model_args.early_stopping = True
model_args.num_return_sequences = 5

**Loading train, test and validation data sets in the text format and initiating GPT2 model**

In [13]:
#Train and test file loading
train_file = "/content/drive/MyDrive/next-word-prediction/next_word_prediction_train_set.txt"
test_file = "/content/drive/MyDrive/next-word-prediction/next_word_prediction_test_set.txt"
validation_file = '/content/drive/MyDrive/next-word-prediction/next_word_prediction_val_set.txt'

model = LanguageModelingModel('gpt2', 'gpt2', args=model_args,  train_files=train_file)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

**Model training**

In [14]:
model.train_model(train_file, eval_file = validation_file)

  0%|          | 0/14578 [00:00<?, ?it/s]

  0%|          | 0/74210 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/4948 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/4948 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/4948 [00:00<?, ?it/s]

Running Epoch 4 of 4:   0%|          | 0/4948 [00:00<?, ?it/s]

(2472, 0.7580259013978712)

**Setting configurations for language generation model, taking user input and generating words/making predictions from the trained model**

In [15]:
Language_gen_args = LanguageGenerationArgs()
Language_gen_args.max_length = 10
Language_gen_args.early_stopping = True
Language_gen_args.max_seq_length = 100

user_input = input("Enter your text: ")

model = LanguageGenerationModel("gpt2", "/content/drive/MyDrive/next-word-prediction/model/checkpoint-2472-epoch-4/",
                                args = Language_gen_args)

output = model.generate(user_input)
output[0]

Enter your text: Today wasn’t a great day. We did the best we could. It just went on and on. A lot of people just dying in front of us. Due to the


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Today wasn’t a great day. We did the best we could. It just went on and on. A lot of people just dying in front of us. Due to the pandemic, the CDC estimates that there are about'