<a href="https://colab.research.google.com/github/karthikcs/colab/blob/master/Text_Pre_Processing_V1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Cleaning

In [0]:
# from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import nltk

### Configuration from the user
In this section, the user can specify the configuration the tool needs to run, for example the input file URL, the key column to process, and additional stop words.

In [0]:
## Configuration
# Location of the ticket export; it is read as a CSV file.
input_file = r'https://storage.googleapis.com/karthik101/Kaplan%20Tickets.csv'

# Name of the column that holds the text to clean.
description_field = 'Description:'

# Extra stop words removed in addition to the spaCy defaults.
more_stopwords = [
    'hi', 'hello', 'an', 'please', 'pls', 'dear', 'dears',
    'from', 'll', 'bo', 'inc',
]

### Input  Data
The data we are processing is one year of ticket history. It contains information on more than 20,000 tickets, and the file is almost 50 MB in size. GitHub does not support files larger than 25 MB, so the file is hosted on Google Cloud Storage (GCP) instead. (Account: karthikcs101)

In [0]:
# Load the ticket export, decoding it as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    input_file,
    encoding='ISO-8859-1',
)

### Data Cleaning
Before we start processing the data, we need to perform the following pre-processing steps.

We have two fields that might affect the topic modelling: Short Description and Long Description. Sometimes one is more important, sometimes the other, so we are planning to concatenate both fields before doing anything else.

1.   **Gensim simple_preprocess** - Converts a document into a list of tokens. This lowercases, tokenizes, and de-accents the text.
2.   **Removing Stopwords** - Removes spaCy's default stop words, as well as any additional user-specified stop words.
3. **Lemmatize** - Using spaCy, converts each word to its lemma. Example: *message*, *messages*, *messaging* all get converted to the root word *message*.







In [0]:
# Mirror the configured description column under a fixed name, then pull
# its values out as a plain Python list for the cleaning steps.
df['key_text'] = df[description_field]
data = df['key_text'].values.tolist()

In [0]:
from gensim.utils import simple_preprocess
import spacy

# Load the English pipeline by its full package name. The 'en' shortcut
# link used previously is obsolete as of spaCy 3.0 and raises OSError there.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (the usual target of
# the shortcut) -- confirm the package is installed in the runtime.
nlp = spacy.load('en_core_web_sm')
# spaCy's default English stop-word set (shared with the language defaults,
# so build a new set instead of mutating it).
stop_words = nlp.Defaults.stop_words

In [0]:
def replace_underscore(sentences):
  """Lazily yield each sentence with underscores replaced by spaces.

  Args:
    sentences: iterable of raw text values. Non-string entries are
      accepted: ``None`` and NaN (how pandas represents an empty cell)
      become ``''``; any other value is converted with ``str()``.

  Yields:
    str: the sentence with every ``'_'`` replaced by ``' '``.
  """
  for sentence in sentences:
    if not isinstance(sentence, str):
      # NaN is the only value that compares unequal to itself.
      is_missing = sentence is None or sentence != sentence
      sentence = '' if is_missing else str(sentence)
    yield sentence.replace('_', ' ')

def simple_processing(sentences):
    """Lazily tokenize every sentence with gensim's ``simple_preprocess``.

    Each item is coerced to ``str`` and tokenized with ``deacc=True``;
    one token list is yielded per input sentence, in input order.
    """
    yield from (
        simple_preprocess(str(text), deacc=True) for text in sentences
    )
        
def remove_stopwords(texts, stopwords=None):
  """Drop stop words from every document in ``texts``.

  Args:
    texts: iterable of documents. A document is normally a list (or
      tuple) of tokens, e.g. the output of ``simple_processing``. Any
      other value is converted with ``str()`` and tokenized with
      ``simple_preprocess`` first, as before.
    stopwords: optional collection of words to remove. When omitted, the
      module-level ``stop_words`` is looked up at call time.

  Returns:
    list[list[str]]: one filtered token list per document, in input order.
  """
  if stopwords is None:
    stopwords = stop_words
  cleaned = []
  for doc in texts:
    if isinstance(doc, (list, tuple)):
      # Already tokenized: filter the tokens directly instead of
      # stringifying the list and re-tokenizing its repr.
      tokens = doc
    else:
      tokens = simple_preprocess(str(doc))
    cleaned.append([word for word in tokens if word not in stopwords])
  return cleaned

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Part-of-speech tags are described at https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists. Each list is joined with single
            spaces and parsed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: collection of coarse POS tags (compared against
            ``token.pos_``) to keep. The default is a tuple rather than a
            list so the shared default cannot be mutated between calls.

    Returns:
        list[list[str]]: for each input document, the lemmas of the tokens
        whose POS tag is in ``allowed_postags``.
    """
    sentences = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches and
    # yields the parsed docs in input order, avoiding one nlp() call per doc.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(sentences)
    ]

In [0]:
# Materialize the generator so the cleaned sentences can be indexed and reused.
data1 = [*replace_underscore(data)]

In [31]:
# Preview the first ten sentences after underscore replacement.
data1[:10]

['FAABoost INC4649505 Testing after migration',
 'Genesys Items 02/25-02/28',
 'WELD INC4649511 Testing after migration',
 'Sharepoint INC4645781 Add Chief Experience Officer to SP&A form',
 'FAABoost INC4646535 FAA move in DEV',
 'Necesito que me ayudes a crear unas cuentas en test porfavor',
 'WELD INC4644786 First Data Password Reset',
 'ECMS INC4645219 Create a report of all assets in Production',
 'BT INC4645196 Biztalk Health Check Report 02/18/2019 - 02/22/2019',
 '1098T INC4645659 Write Off Issue']

In [0]:
## Step 1: Simple preprocessing -- turn every sentence into a list of tokens
data_words = [*simple_processing(data1)]

In [33]:
# Preview the first ten token lists.
data_words[:10]

[['faaboost', 'inc', 'testing', 'after', 'migration'],
 ['genesys', 'items'],
 ['weld', 'inc', 'testing', 'after', 'migration'],
 ['sharepoint',
  'inc',
  'add',
  'chief',
  'experience',
  'officer',
  'to',
  'sp',
  'form'],
 ['faaboost', 'inc', 'faa', 'move', 'in', 'dev'],
 ['necesito',
  'que',
  'me',
  'ayudes',
  'crear',
  'unas',
  'cuentas',
  'en',
  'test',
  'porfavor'],
 ['weld', 'inc', 'first', 'data', 'password', 'reset'],
 ['ecms',
  'inc',
  'create',
  'report',
  'of',
  'all',
  'assets',
  'in',
  'production'],
 ['bt', 'inc', 'biztalk', 'health', 'check', 'report'],
 ['inc', 'write', 'off', 'issue']]

In [0]:
# Step 2: Removing stopwords
# Combine spaCy's default stop words with the user-supplied extras from the
# configuration, then filter them out of every tokenized document.
stp_list = [*nlp.Defaults.stop_words, *more_stopwords]
stop_words = set(stp_list)
data_words_nostops = remove_stopwords(data_words)

In [0]:
## Step 3: Lemmatize the data words
## Skipped for now -- uncomment the two lines below to enable lemmatization
# data_lemmatized = lemmatization(data_words_nostops)
# data_lemmatized[:10]

In [36]:
# The lemmatization call is commented out, so the stop-word-filtered
# tokens are used as the final tokenized sentences.
tokanized_sentenaces = data_words_nostops
# Preview the first ten token lists.
tokanized_sentenaces[:10]

[['faaboost', 'testing', 'migration'],
 ['genesys', 'items'],
 ['weld', 'testing', 'migration'],
 ['sharepoint', 'add', 'chief', 'experience', 'officer', 'sp', 'form'],
 ['faaboost', 'faa', 'dev'],
 ['necesito',
  'que',
  'ayudes',
  'crear',
  'unas',
  'cuentas',
  'en',
  'test',
  'porfavor'],
 ['weld', 'data', 'password', 'reset'],
 ['ecms', 'create', 'report', 'assets', 'production'],
 ['bt', 'biztalk', 'health', 'check', 'report'],
 ['write', 'issue']]

In [0]:
def convert_wordlist_to_text(wordlist):
  """Join every token list in ``wordlist`` into one space-separated string.

  Returns a new list holding one string per input item, in the same order.
  """
  return [' '.join(tokens) for tokens in wordlist]

In [0]:
# Turn each token list back into a single string and attach the result to
# the DataFrame as a new column.
clean_text = convert_wordlist_to_text(wordlist=tokanized_sentenaces)
df['clean_text'] = clean_text

In [46]:
# Write the DataFrame to a local CSV file.
df.to_csv('clean_text.csv')
# POST that file to the 'karthik101' Cloud Storage bucket as object 'clean_text.csv'.
# NOTE(review): the request carries no credentials, so it presumably relies on the
# bucket accepting anonymous writes -- confirm this is intended.
!curl -X POST --data-binary @'clean_text.csv' -H "Content-Type: text/csv" "https://www.googleapis.com/upload/storage/v1/b/karthik101/o?uploadType=media&name=clean_text.csv"

{
 "kind": "storage#object",
 "id": "karthik101/clean_text.csv/1561994722196827",
 "selfLink": "https://www.googleapis.com/storage/v1/b/karthik101/o/clean_text.csv",
 "name": "clean_text.csv",
 "bucket": "karthik101",
 "generation": "1561994722196827",
 "metageneration": "1",
 "contentType": "text/csv",
 "timeCreated": "2019-07-01T15:25:22.196Z",
 "updated": "2019-07-01T15:25:22.196Z",
 "storageClass": "MULTI_REGIONAL",
 "timeStorageClassUpdated": "2019-07-01T15:25:22.196Z",
 "size": "331459",
 "md5Hash": "bEIkcZbswH9qIvVpzobiCw==",
 "mediaLink": "https://www.googleapis.com/download/storage/v1/b/karthik101/o/clean_text.csv?generation=1561994722196827&alt=media",
 "crc32c": "TPNXxQ==",
 "etag": "CNv6uuSDlOMCEAE="
}
