In [11]:
import numpy as np
import pandas as pd
import textdistance
import re
from collections import Counter

## File opening and cleaning

In [5]:
data = ""
with open('training_text.txt', 'r', encoding='utf-8') as f:
    data = f.read()

print(data)

=== C:\Users\James\Documents\Programming\BouncyBallStudio\BallStateInterface\BallStateContainer.h ===
#pragma once

#include <memory>

#include "BallStateInterface.h"


class BallStateContainer {

public:
	BallStateContainer();

	// Add a state to the container.
	// The state can then be accessed either by name or by ID.
	// If you do not specify an ID, the container will assign a default ID
	// that starts at 0 and increments by 1 for each state you add.
	void Add(std::shared_ptr<BallStateInterface> state, int id = -1);

	// Retrieve a state by name. Throws invalid_argument exception if not found.
	std::shared_ptr<BallStateInterface> Get(std::string state_name);

	// Retrieve a state by ID. Throws invalid_argument exception if not found.
	std::shared_ptr<BallStateInterface> Get(int state_id);

private:
	struct Impl;
	std::shared_ptr<Impl> impl;
	int idCounter = 0;

};



=== C:\Users\James\Documents\Programming\BouncyBallStudio\BallStateInterface\BallStateInterface.h ===
#pragma once


In [19]:
# Regex expression matches all words that start with a letter (excludes numbers and other things)
words = re.findall('[A-Za-z]\w+', data)
print(words)

['Users', 'James', 'Documents', 'Programming', 'BouncyBallStudio', 'BallStateInterface', 'BallStateContainer', 'pragma', 'once', 'include', 'memory', 'include', 'BallStateInterface', 'class', 'BallStateContainer', 'public', 'BallStateContainer', 'Add', 'state', 'to', 'the', 'container', 'The', 'state', 'can', 'then', 'be', 'accessed', 'either', 'by', 'name', 'or', 'by', 'ID', 'If', 'you', 'do', 'not', 'specify', 'an', 'ID', 'the', 'container', 'will', 'assign', 'default', 'ID', 'that', 'starts', 'at', 'and', 'increments', 'by', 'for', 'each', 'state', 'you', 'add', 'void', 'Add', 'std', 'shared_ptr', 'BallStateInterface', 'state', 'int', 'id', 'Retrieve', 'state', 'by', 'name', 'Throws', 'invalid_argument', 'exception', 'if', 'not', 'found', 'std', 'shared_ptr', 'BallStateInterface', 'Get', 'std', 'string', 'state_name', 'Retrieve', 'state', 'by', 'ID', 'Throws', 'invalid_argument', 'exception', 'if', 'not', 'found', 'std', 'shared_ptr', 'BallStateInterface', 'Get', 'int', 'state_id', 

In [20]:
len(words)

4852

In [21]:
len(set(words))

748

In [22]:
set(words)

{'ACCELERATION_DUE_TO_GRAVITY',
 'Adapted',
 'Add',
 'AddDLLToNameMap',
 'AddDataOutputInterface',
 'AddField',
 'AddPoint',
 'AddViewButton',
 'AddViewChoices',
 'AddViewDataNameTextCtrl',
 'AddViewLabel',
 'AddViewPanel',
 'Addition',
 'AdditionFunction',
 'Adds',
 'After',
 'Apply',
 'ApplyAcceleration',
 'ApplyVelocity',
 'AutotuneType_Motor',
 'AutotuneType_TEC',
 'Axes',
 'BEGIN_EVENT_TABLE',
 'BS_Height',
 'BS_Velocity',
 'Ball',
 'BallStateContainer',
 'BallStateDataOutput',
 'BallStateDataOutputInterface',
 'BallStateInterface',
 'BallState_Height',
 'BallState_Velocity',
 'BallStates',
 'Bind',
 'Bounce',
 'Bouncy',
 'BouncyBallApp',
 'BouncyBallStudio',
 'BouncyBallStudio_Console',
 'BouncyBallStudio_GUI',
 'CHOICE_CANVAS',
 'CHOICE_SLIDER',
 'CHOICE_SLIDER_UNSIGNED',
 'CHOICE_TEXT',
 'Can',
 'Canvas',
 'CanvasView',
 'Centre',
 'Clear',
 'ClearCurrentlyLoadedDLL',
 'Close',
 'Controls',
 'ControlsPanel',
 'Convert',
 'Couldn',
 'Create',
 'CreateTechniqueInstance',
 'Create

In [23]:
word_freq_dict = Counter(words)
word_freq_dict

Counter({'Users': 49,
         'James': 49,
         'Documents': 49,
         'Programming': 49,
         'BouncyBallStudio': 59,
         'BallStateInterface': 43,
         'BallStateContainer': 11,
         'pragma': 27,
         'once': 27,
         'include': 116,
         'memory': 5,
         'class': 30,
         'public': 47,
         'Add': 30,
         'state': 11,
         'to': 27,
         'the': 39,
         'container': 2,
         'The': 6,
         'can': 4,
         'then': 2,
         'be': 9,
         'accessed': 1,
         'either': 1,
         'by': 13,
         'name': 15,
         'or': 2,
         'ID': 4,
         'If': 3,
         'you': 2,
         'do': 1,
         'not': 6,
         'specify': 1,
         'an': 5,
         'will': 5,
         'assign': 1,
         'default': 5,
         'that': 3,
         'starts': 1,
         'at': 2,
         'and': 22,
         'increments': 1,
         'for': 26,
         'each': 3,
         'add': 1,
         'void

In [24]:
word_freq_dict.most_common(10)

[('void', 148),
 ('include', 116),
 ('std', 84),
 ('string', 80),
 ('const', 77),
 ('int', 74),
 ('BouncyBallStudio', 59),
 ('if', 58),
 ('declspec', 52),
 ('dllexport', 52)]

## Relative frequency of words

In [29]:
probs = {}
for word, freq in word_freq_dict.items():
    probs[word] = freq / len(words)

In [30]:
probs

{'Users': 0.010098928276999175,
 'James': 0.010098928276999175,
 'Documents': 0.010098928276999175,
 'Programming': 0.010098928276999175,
 'BouncyBallStudio': 0.012159934047815334,
 'BallStateInterface': 0.00886232481450948,
 'BallStateContainer': 0.002267106347897774,
 'pragma': 0.005564715581203627,
 'once': 0.005564715581203627,
 'include': 0.023907666941467436,
 'memory': 0.001030502885408079,
 'class': 0.006183017312448475,
 'public': 0.009686727122835944,
 'Add': 0.006183017312448475,
 'state': 0.002267106347897774,
 'to': 0.005564715581203627,
 'the': 0.008037922506183017,
 'container': 0.00041220115416323167,
 'The': 0.001236603462489695,
 'can': 0.0008244023083264633,
 'then': 0.00041220115416323167,
 'be': 0.0018549051937345425,
 'accessed': 0.00020610057708161583,
 'either': 0.00020610057708161583,
 'by': 0.0026793075020610057,
 'name': 0.0030915086562242375,
 'or': 0.00041220115416323167,
 'ID': 0.0008244023083264633,
 'If': 0.0006183017312448475,
 'you': 0.0004122011541632

## Finding similar words

Now we will sort similar words according to the Jaccard distance by calculating the 2 grams Q of the words.  
Next, we will return the 5 most similar words ordered by similarity and probability.

The Jaccard distance measures the dissimilarity between two sets by comparing their intersection and union.

In [45]:
def autocorrect(word, num_matches=5):

    word = word.lower()
    if word in probs:
        print("The word is already there", word)
    else:
        similarities = [1-(textdistance.Jaccard(qval=2)).distance(w, word) for w in word_freq_dict.keys()]
        df = pd.DataFrame.from_dict(probs, orient='index').reset_index()
        df = df.rename(columns={"index":"word", 0: "Prob"})
        df["Similarity"] = similarities
        output = df.sort_values(["Similarity", "Prob"], ascending=False).head(num_matches)
        return output

In [50]:
autocorrect("velo")

Unnamed: 0,word,Prob,Similarity
740,velocity,0.003916,0.428571
400,Hello,0.000206,0.4
216,developed,0.000206,0.375
275,development,0.000206,0.3
119,velocity_m_s,0.00371,0.272727


In [40]:
similarities = [1-(textdistance.Jaccard(qval=2)).distance(w, word) for w in word_freq_dict.keys()]
df = pd.DataFrame.from_dict(probs, orient='index').reset_index()
df = df.rename(columns={"index":"word", 0: "Prob"})
df["Similarity"] = similarities
output = df.sort_values(["Similarity", "Prob"], ascending=False)
output

Unnamed: 0,word,Prob,Similarity
747,dampening,0.000206,1.000000
289,Undamped,0.000618,0.363636
346,Damped,0.000412,0.300000
236,running,0.001237,0.272727
69,implementing,0.000206,0.266667
...,...,...,...
732,uses,0.000206,0.000000
733,widgets,0.000206,0.000000
735,declarations,0.000206,0.000000
744,Reduce,0.000206,0.000000
