# Clean up the Financial Tweets data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re
import itertools
%matplotlib inline
pd.set_option('display.max_colwidth', 100)

In [3]:
df = pd.read_csv("archive/stockerbot-export.csv")

In [5]:
def clean_text(sentence):
    # convert to lower case
    sentence = sentence.lower()
    # remove URLs
    sentence = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', sentence)
    # remove ampersands
    sentence = re.sub('&amp;', '', sentence)
    # remove financial acronyms
    sentence = re.sub('\\$[a-z]+', '', sentence)
    # remove punctuation and other characters
    sentence = re.sub('[^a-z ]', '', sentence)
    return sentence

In [6]:
sentence = ''
for i in range(len(df)):
    sentence += df['text'][i]
cleaned_text = clean_text(sentence)

In [9]:
all_bigrams = [''.join(x) for x in itertools.combinations('abcdefghijklmnopqrstuvwxyz',2)]

counts = {bigram: 0 for bigram in all_bigrams}
for i in range(len(cleaned_text) - 1):
    bigram = ''.join(sorted(cleaned_text[i:i+2]))
    if bigram in counts:
        counts[bigram] += 1


The results of the weight for each letter pair

In [28]:
sorted(counts.items(), key=lambda x:-x[1])

[('er', 44109),
 ('in', 40899),
 ('ar', 29110),
 ('es', 28073),
 ('or', 27803),
 ('st', 27369),
 ('an', 27011),
 ('no', 24938),
 ('it', 24248),
 ('et', 22645),
 ('en', 21483),
 ('at', 20169),
 ('al', 19878),
 ('ot', 19731),
 ('de', 19526),
 ('is', 16403),
 ('gn', 15834),
 ('rt', 15534),
 ('co', 15422),
 ('ce', 15341),
 ('il', 15004),
 ('ht', 14856),
 ('fo', 14569),
 ('el', 12934),
 ('io', 12224),
 ('eh', 11725),
 ('as', 11518),
 ('lo', 11197),
 ('ad', 10994),
 ('nt', 10774),
 ('em', 10717),
 ('ir', 10713),
 ('am', 10639),
 ('mo', 10500),
 ('ac', 10182),
 ('ae', 9926),
 ('ev', 9903),
 ('ci', 9764),
 ('ah', 9511),
 ('ou', 9223),
 ('cn', 9209),
 ('op', 8735),
 ('di', 8523),
 ('ep', 8367),
 ('su', 8286),
 ('eg', 8205),
 ('ap', 7703),
 ('ai', 7588),
 ('dn', 7529),
 ('os', 7493),
 ('pr', 7233),
 ('ew', 7205),
 ('im', 7051),
 ('gr', 7039),
 ('ch', 6953),
 ('tu', 6716),
 ('gi', 6572),
 ('ck', 6160),
 ('ru', 6144),
 ('hs', 6127),
 ('ow', 5969),
 ('nu', 5778),
 ('hi', 5680),
 ('ek', 5669),
 ('pu

# The following cells are to run simulations

In [12]:
# The groups are imported from Selena's code
groups =[['e', 'u', 'z'], ['c', 'g', 'w'], ['s', 'm', 'f', 'v'], ['a', 'o', 'q'], ['t', 'd', 'k'], ['i', 'y', 'x', 'j'], ['n', 'p', 'b'], ['r', 'l', 'h']]

n_keys = len(groups)

# make the weight matrix for the groups
W = np.zeros((n_keys, n_keys), dtype=int)
for i in range(n_keys):
    for j in range(n_keys):
        if j == i:
            continue
        total_weight = 0
        for letter_A in groups[i]:
            for letter_B in groups[j]:
                total_weight += counts[''.join(sorted(letter_A + letter_B))]
        W[i, j] = total_weight
W

array([[     0,  34835,  64063,  27098,  56596,  17412,  49490,  81280],
       [ 34835,      0,   8175,  42226,  14763,  22591,  27596,  27092],
       [ 64063,   8175,      0,  63607,  37088,  39305,  16019,  20881],
       [ 27098,  42226,  63607,      0,  59993,  30964,  76719, 103049],
       [ 56596,  14763,  37088,  59993,      0,  38407,  24241,  42459],
       [ 17412,  22591,  39305,  30964,  38407,      0,  56439,  39507],
       [ 49490,  27596,  16019,  76719,  24241,  56439,      0,  20761],
       [ 81280,  27092,  20881, 103049,  42459,  39507,  20761,      0]])

In [13]:
## print it to Latex code
# print('\\begin{bmatrix}')
# for i in range(W.shape[0]):
#     print(' & '.join(map(str, W[i])) + '\\\\')
# print('\\end{bmatrix}')

\begin{bmatrix}
0 & 34835 & 64063 & 27098 & 56596 & 17412 & 49490 & 81280\\
34835 & 0 & 8175 & 42226 & 14763 & 22591 & 27596 & 27092\\
64063 & 8175 & 0 & 63607 & 37088 & 39305 & 16019 & 20881\\
27098 & 42226 & 63607 & 0 & 59993 & 30964 & 76719 & 103049\\
56596 & 14763 & 37088 & 59993 & 0 & 38407 & 24241 & 42459\\
17412 & 22591 & 39305 & 30964 & 38407 & 0 & 56439 & 39507\\
49490 & 27596 & 16019 & 76719 & 24241 & 56439 & 0 & 20761\\
81280 & 27092 & 20881 & 103049 & 42459 & 39507 & 20761 & 0\\
\end{bmatrix}


In [24]:
# calculate the distance for the 8 keys
from sklearn.metrics import pairwise_distances
coords = np.array([
    [0, 0],
    [0, 1],
    [0, 2],
    [1, 0],
    [1, 1],
    [1, 2],
    [2, 0],
    [2, 1],
])
D = pairwise_distances(coords)

In [16]:
# make the permutation matrices iteratively, 
# and record the one that minimizes the objective function
from itertools import permutations
min_val = np.inf
best_P = None
best_permutation = None

for permutation in permutations(list(range(8)), 8):
    P = np.zeros((8, 8))
    for i, x in enumerate(permutation):
        P[i, x] = 1
    
    val = np.sum(W * (P @ D @ P.T))
    if val < min_val:
        best_P = P
        best_permutation = permutation
        min_val = val

In [17]:
min_val

3230525.414373288

In [18]:
best_permutation

(0, 2, 6, 4, 3, 7, 5, 1)

In [19]:
# add a key for the Space
groups_permuted = [groups[b] for b in best_permutation] + [[' ']]
groups_permuted

[['e', 'u', 'z'],
 ['s', 'm', 'f', 'v'],
 ['n', 'p', 'b'],
 ['t', 'd', 'k'],
 ['a', 'o', 'q'],
 ['r', 'l', 'h'],
 ['i', 'y', 'x', 'j'],
 ['c', 'g', 'w'],
 [' ']]

In [21]:
# import the old layout for comparision in the simulation
groups_old = [['a', 'b', 'c'],
            ['d', 'e','f'],
            ['g', 'h','i'],
            ['j','k','l'],
            ['m','n','o'],
            ['p','q','r','s'],
            ['t','u','v'],
            ['w','x','y','z'],
            [' ']]

In [22]:
# the simulation
# see the report for details
dt_short = 0.1
dt_long = 1.0
dist_const = 0.1

def time_taken(text, groups_list):
    # make the letter_to_group_number
    letter_to_group_number = {}
    for i, group in enumerate(groups_list):
        for letter in group:
            letter_to_group_number[letter] = i
    
    time = 0.0
    inter_group_transitions = 0
    outer_group_transitions = 0
    dist = 0
    index = 0
    
    curr_group = letter_to_group_number[text[0]]

    for letter in text:
        new_group = letter_to_group_number[letter]
        if curr_group == new_group:
            inter_group_transitions += 1
            time += dt_long
        else:
            outer_group_transitions += 1
            time += D[curr_group, new_group] * dist_const
            dist += D[curr_group, new_group]
        index_new = groups_list[new_group].index(letter)
        index += index_new
        time += (index_new + 1) * dt_short

        curr_group = new_group

    time -= dt_long
    
    return time, inter_group_transitions, outer_group_transitions, dist, index



In [None]:
# print the result for simulation
def print_results(text):
    time_old, igt_old, ogt_old, dist_old, index_old = time_taken(text, groups_old)
    time_new, igt_new, ogt_new, dist_new, index_new = time_taken(text, groups_permuted)
    
    d = {
        'layout': ['Old Layout', 'New Layout', 'New/Old'],
        'time': [time_old, time_new, time_new / time_old],
        'intra-group transitions': [igt_old, igt_new, igt_new / igt_old],
        'inter-group transitions': [ogt_old, ogt_new, ogt_new / ogt_old],
        'distance': [dist_old, dist_new, dist_new / dist_old],
        'index': [index_old, index_new, index_new / index_old],
        'percent intra-group transitions': [igt_old / (igt_old+ogt_old), igt_new / (igt_new+ogt_new),
                                           igt_new / igt_old]
    }
    df = pd.DataFrame(data=d)
    
    return df


In [4]:
# open two other testing texts
with open('ken_email.txt') as f:
    ken_email = f.read()
with open('kinglear.txt') as f:
    kinglear = f.read()
cleaned_email = clean_text(re.sub(' +', ' ', ken_email.replace('\n', ' ')))
cleaned_kinglear = clean_text(re.sub(' +', ' ', kinglear.replace('\n', ' ')))

In [25]:
# Calculate the new distance matrix with the Space key added
from sklearn.metrics import pairwise_distances
coords = np.array([
    [0, 0],
    [0, 1],
    [0, 2],
    [1, 0],
    [1, 1],
    [1, 2],
    [2, 0],
    [2, 1],
    [2, 2]
])
D = pairwise_distances(coords)

In [30]:
# text = 'the quick brown fox jumps over the lazy dog'
print('cleaned_text:')
display(print_results(cleaned_text))
print('cleaned_email:')
display(print_results(cleaned_email))
print('cleaned_kinglear:')
display(print_results(cleaned_kinglear))

cleaned_text:


Unnamed: 0,layout,time,intra-group transitions,inter-group transitions,distance,index,percent intra-group transitions
0,Old Layout,919431.029465,287570.0,1601682.0,2642915.0,1786453.0,0.152214
1,New Layout,734699.662739,187839.0,1701413.0,2819761.0,759604.0,0.099425
2,New/Old,0.799081,0.653194,1.062266,1.066913,0.4252023,0.653194


cleaned_email:


Unnamed: 0,layout,time,intra-group transitions,inter-group transitions,distance,index,percent intra-group transitions
0,Old Layout,1625.970345,328.0,3400.0,5787.70345,3474.0,0.087983
1,New Layout,1263.503446,96.0,3632.0,6258.034459,1699.0,0.025751
2,New/Old,0.777077,0.292683,1.068235,1.081264,0.489062,0.292683


cleaned_kinglear:


Unnamed: 0,layout,time,intra-group transitions,inter-group transitions,distance,index,percent intra-group transitions
0,Old Layout,60659.038919,10879.0,131952.0,221189.389187,133790.0,0.076167
1,New Layout,48293.015792,3451.0,139380.0,236883.157919,68716.0,0.024161
2,New/Old,0.796139,0.317217,1.056293,1.070952,0.513611,0.317217
