# Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import json
import sys
import os
import torch

import pandas as pd
from tqdm import tqdm

tqdm.pandas()

import transformers

data_location = "data"
model_name = 'gpt2'
cache_dir = 'cache'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
min_words_sample = 200

# Import data

In [6]:
data = pd.read_csv(data_location + "/guardian_articles.csv")
data = data.dropna()

In [7]:
data

0        AMERIA Investment Consulting Company is seekin...
2        Public outreach and strengthening of a growing...
3        The LEAD (Local Enhancement and Development fo...
5           Saleswoman will sell menswear and accessories.
6        The Armenian Branch Office of the Open Society...
                               ...                        
18992    The incumbent will develop software applicatio...
18995    The incumbent will be responsible for supporti...
18996    A tech startup of Technolinguistics based in N...
18999    San Lazzaro LLC is looking for a well-experien...
19000    "Kamurj" UCO CJSC is looking for a Lawyer in L...
Name: JobDescription, Length: 15109, dtype: object

# Create samples

In [11]:
def create_range_point_dict(point_indices, len_sample):
    result = {}
    for point in point_indices:
        max_range = len_sample - point + 1
        if max_range < min_words:
            result[point] = None
        else:
            max_range = min(max_words, max_range)
            result[point] = max_range

    if all(value is None for value in result.values()):
        return False
    else:
        result = {k: v for k, v in result.items() if v is not None}
        return result


def sample_text(sample):
    sample = sample.split()
    point_indices = [i for i in range(len(sample)) if "." in sample[i]]

    if len(sample) < min_words_sample or len(point_indices) == 0:
        return None

    ranges = create_range_point_dict(point_indices, len(sample))

    if ranges == False:
        return None

    # Plus one, as we want to start after a dot
    point_chosen_begin = np.random.choice(list(ranges.keys())) + 1

    # Minus one, as we want to choose the specific item
    chosen_length = min_words if ranges[point_chosen_begin - 1] == min_words else np.random.randint(min_words, ranges[point_chosen_begin - 1])
    point_chosen_end = point_chosen_begin + chosen_length

    return ' '.join(sample[point_chosen_begin : point_chosen_end])

In [12]:
min_words = 55
max_words = 200

In [208]:
data["sampled"] = data["bodyContent"].progress_apply(lambda x: sample_text(x))
data = data.dropna()

100%|██████████| 148731/148731 [00:26<00:00, 5582.39it/s]


# Preprocess data

In [209]:
def process_spaces(story):
    return story.replace(
        ' ,', ',').replace(
        ' .', '.').replace(
        ' ?', '?').replace(
        ' !', '!').replace(
        ' ;', ';').replace(
        ' \'', '\'').replace(
        ' ’ ', '\'').replace(
        ' :', ':').replace(
        '<newline>', '\n').replace(
        '`` ', '"').replace(
        ' \'\'', '"').replace(
        '\'\'', '"').replace(
        '.. ', '... ').replace(
        ' )', ')').replace(
        '( ', '(').replace(
        ' n\'t', 'n\'t').replace(
        ' i ', ' I ').replace(
        ' i\'', ' I\'').replace(
        '\\\'', '\'').replace(
        '\n ', '\n').strip()

In [212]:
data["sampled"] = data["sampled"].progress_apply(lambda x: process_spaces(x))

100%|██████████| 143144/143144 [00:02<00:00, 65874.79it/s]


In [218]:
data["sampled"].values[1]

'Leffler’s tractor also features pictures of wolves – because “I just like ‘em,” he said – and an Iowa buck, of which as a “big-time deer hunter” he shoots “two or three” a year. Asked about his support for Huckabee, given his parlous position in the polls, Leffler said: “Doesn’t matter. “I’m a principled man, and that’s the reason I’m going for Mike Huckabee. No1 his stand on guns: perfect. His stand on pro-life: perfect. He’s the only man who can stand up and say, ‘I’ve beaten the Clinton machine.’ Thirdly, as Ronald Reagan said, our best presidents have come from the governors program. “When you add it all up, he’s a man of God and when you look at the credentials of what he can do, I have to go with Mike Huckabee. To me it’s not even a choice, it’s just clear-cut.” Leffler also said that if Huckabee is not the Republican'

In [136]:
data["bodyContent"].isna().sum()

0

In [128]:
sampled_df["bodyContent"].dropna().isna().sum()

0

In [126]:
for x in sampled_df["bodyContent"].str.split():
    if x == "" or x == np.nan:
        print("!!!!!!!!!")
    print(x)
    print(len(x))

['As', 'polling', 'day', 'looms', 'and', 'the', 'cameras', 'turn', 'only', 'toward', 'the', 'contenders', 'to', 'win', 'the', 'Iowa', 'caucuses,', 'at', 'least', 'some', 'around', 'nation', 'can’t', 'help', 'but', 'wonder:', 'what', 'happened', 'to', 'the', 'other', 'guys?', 'On', 'the', 'final', 'weekend', 'before', 'the', 'Iowa', 'caucuses,', 'the', 'presidential', 'candidates', 'straggling', 'behind', 'the', 'leaders', '–', 'Donald', 'Trump,', 'Ted', 'Cruz', 'and', 'Marco', 'Rubio', 'for', 'the', 'Republicans,', 'Hillary', 'Clinton', 'and', 'Bernie', 'Sanders', 'for', 'the', 'Democrats', '–', 'carried', 'on', 'with', 'brave', 'faces,', 'despite', 'sinking', 'poll', 'numbers', 'and', 'ambivalent', 'voters.', 'Underdog', 'candidates', 'could,', 'however,', 'have', 'an', 'outsize', 'effect', 'on', 'the', 'campaign:', 'their', 'supporters', 'represent', 'the', 'spread', 'between', 'the', 'leaders.', 'Sway', 'voters', 'who', 'prefer', 'a', 'third-', '(or', 'fourth-', 'or', 'fifth-)', 'ti

TypeError: object of type 'float' has no len()

# Job posts data

In [15]:
data = pd.read_csv(data_location + "/data job posts.csv")
data = data.dropna(subset=["JobDescription"])

In [19]:
def sample_beginning_text(sample):
    sample = sample.split()
    if len(sample) < min_words_sample:
        return None

    # Plus one, as we want to start after a dot
    point_chosen_begin = 0

    max_range = min(max_words, len(sample) - point_chosen_begin)

    # Minus one, as we want to choose the specific item
    chosen_length = np.random.randint(min_words, max_range)
    point_chosen_end = point_chosen_begin + chosen_length

    return ' '.join(sample[point_chosen_begin : point_chosen_end])