#### Normalizing and stemming: find different forms of a word that share the same root and count them as one 
#### For english:

In [1]:
import nltk 
# Porter stemmer instance; the commented line below shows how to stem a list of tokens.
porter = nltk.PorterStemmer()
# [porter.stem(t) for t in words]

##### Lemmatization: a lighter version of stemming whose results are valid words
#### For english: 

In [3]:
# WordNet lemmatizer instance; the commented line below shows how to lemmatize a list of tokens.
WNlemma = nltk.WordNetLemmatizer()
# [WNlemma.lemmatize(t) for t in words]

#### Tokenization: split the sentence into words in meaningful way:

In [4]:
# nltk.word_tokenize(text) 

#### Sentence splitting:

In [5]:
# nltk.sent_tokenize(text)

#### Part of speech tagging (POS): determining the role of the word in the sentence:

In [6]:
# nltk.pos_tag(text)

#### Parsing sentence structure:

In [7]:
from nltk.corpus import treebank 
# text = treebank.parsed_sents(sents)

#### finding the frequency of a token

In [8]:
from nltk.probability import FreqDist
# fdist = FreqDist(word for word in tokens)
# fdist['wordOfInterest']
# fdist.most_common(20)

## Spelling Recommender

Three different spelling recommenders that each take a list of misspelled words and recommend a correctly spelled word for every word in the list.

For every misspelled word, the recommender finds the word in `correct_spellings` that has the shortest distance* and starts with the same letter as the misspelled word, and returns that word as a recommendation.

Each of the three different recommenders will use a different distance measure (outlined below).

Each of the recommenders provides recommendations for the three default words when called without arguments: `['cormulent', 'incendenece', 'validrate']`.

In [9]:
from nltk.corpus import words
# Reference vocabulary that the spelling recommenders below search through.
# NOTE(review): presumably requires the NLTK "words" corpus to be available
# locally (e.g. via nltk.download('words')) — confirm for a fresh environment.
correct_spellings = words.words()

**[Jaccard distance](https://en.wikipedia.org/wiki/Jaccard_index) on the trigrams of the two words.**

In [10]:
import pandas as pd
def j_distance01(entries=['cormulent', 'incendenece', 'validrate'], n=3):
    """Recommend a spelling for each entry using Jaccard distance on character n-grams.

    For every entry, the candidates are the words in ``correct_spellings``
    that start with the same character as the entry. The candidate whose set
    of character n-grams has the smallest Jaccard distance to the entry's
    n-gram set is recommended; ties go to the candidate that appears first in
    ``correct_spellings``.

    Parameters
    ----------
    entries : iterable of str
        Misspelled words. The default list is only read, never modified.
    n : int, optional
        Size of the character n-grams (default 3, i.e. trigrams).

    Returns
    -------
    list
        One recommendation per entry, in input order (repeated entries each
        keep their own slot). An item is ``None`` when the entry is empty or
        when no word in ``correct_spellings`` starts with the same character.
    """
    def _distance(entry_ngrams, word):
        word_ngrams = set(nltk.ngrams(word, n=n))
        # nltk.jaccard_distance divides by the size of the union, so two empty
        # n-gram sets (both strings shorter than n) would raise
        # ZeroDivisionError. Two empty sets are identical, hence distance 0.
        if not entry_ngrams and not word_ngrams:
            return 0.0
        return nltk.jaccard_distance(entry_ngrams, word_ngrams)

    recommendations = []
    for entry in entries:
        if not entry:
            # No first character to match on; keep the slot so the output
            # stays aligned with the input.
            recommendations.append(None)
            continue
        entry_ngrams = set(nltk.ngrams(entry, n=n))
        first_char = entry[0]
        candidates = (w for w in correct_spellings if w.startswith(first_char))
        # min() returns the first of several equally close candidates, which is
        # the same tie-break as scanning with a strict "<" comparison.
        best = min(
            candidates,
            key=lambda w: _distance(entry_ngrams, w),
            default=None,
        )
        recommendations.append(best)
    return recommendations

j_distance01()

['corpulent', 'indecence', 'validate']

**[Jaccard distance](https://en.wikipedia.org/wiki/Jaccard_index) on the 4-grams of the two words.**

In [13]:
def j_distance02(entries=['cormulent', 'incendenece', 'validrate'], n=4):
    """Recommend a spelling for each entry using Jaccard distance on character n-grams.

    For every entry, the candidates are the words in ``correct_spellings``
    that start with the same character as the entry. The candidate whose set
    of character n-grams has the smallest Jaccard distance to the entry's
    n-gram set is recommended; ties go to the candidate that appears first in
    ``correct_spellings``.

    Parameters
    ----------
    entries : iterable of str
        Misspelled words. The default list is only read, never modified.
    n : int, optional
        Size of the character n-grams (default 4).

    Returns
    -------
    list
        One recommendation per entry, in input order (repeated entries each
        keep their own slot). An item is ``None`` when the entry is empty or
        when no word in ``correct_spellings`` starts with the same character.
    """
    def _distance(entry_ngrams, word):
        word_ngrams = set(nltk.ngrams(word, n=n))
        # nltk.jaccard_distance divides by the size of the union, so two empty
        # n-gram sets (both strings shorter than n) would raise
        # ZeroDivisionError. Two empty sets are identical, hence distance 0.
        if not entry_ngrams and not word_ngrams:
            return 0.0
        return nltk.jaccard_distance(entry_ngrams, word_ngrams)

    recommendations = []
    for entry in entries:
        if not entry:
            # No first character to match on; keep the slot so the output
            # stays aligned with the input.
            recommendations.append(None)
            continue
        entry_ngrams = set(nltk.ngrams(entry, n=n))
        first_char = entry[0]
        candidates = (w for w in correct_spellings if w.startswith(first_char))
        # min() returns the first of several equally close candidates, which is
        # the same tie-break as scanning with a strict "<" comparison.
        best = min(
            candidates,
            key=lambda w: _distance(entry_ngrams, w),
            default=None,
        )
        recommendations.append(best)
    return recommendations
j_distance02()

['cormus', 'incendiary', 'valid']

**[Edit distance on the two words with transpositions.](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)**

In [14]:
def e_distance(entries=['cormulent', 'incendenece', 'validrate'], transpositions=True):
    """Recommend a spelling for each entry using edit distance with transpositions.

    For every entry, the candidates are the words in ``correct_spellings``
    that start with the same character as the entry. The candidate with the
    smallest edit distance to the entry is recommended; ties go to the
    candidate that appears first in ``correct_spellings``.

    Parameters
    ----------
    entries : iterable of str
        Misspelled words. The default list is only read, never modified.
    transpositions : bool, optional
        Forwarded to ``nltk.edit_distance``. Defaults to True so that swapping
        two adjacent characters counts as a single edit (Damerau-Levenshtein),
        which is the distance this recommender is documented to use.

    Returns
    -------
    list
        One recommendation per entry, in input order (repeated entries each
        keep their own slot). An item is ``None`` when the entry is empty or
        when no word in ``correct_spellings`` starts with the same character.
    """
    recommendations = []
    for entry in entries:
        if not entry:
            # No first character to match on; keep the slot so the output
            # stays aligned with the input.
            recommendations.append(None)
            continue
        first_char = entry[0]
        candidates = (w for w in correct_spellings if w.startswith(first_char))
        # min() returns the first of several equally close candidates, which is
        # the same tie-break as scanning with a strict "<" comparison.
        best = min(
            candidates,
            key=lambda w: nltk.edit_distance(entry, w, transpositions=transpositions),
            default=None,
        )
        recommendations.append(best)
    return recommendations
e_distance()

['corpulent', 'intendence', 'validate']

### Naive Bayes classifier with sklearn

In [20]:
from sklearn import naive_bayes
# Multinomial Naive Bayes classifier created with its default settings.
clfrNB = naive_bayes.MultinomialNB() # alpha can be set explicitly instead, e.g. MultinomialNB(alpha=0.1)

In [2]:
#clfrNB.fit(train_data, train_label)
#predicted_labels = clfrNB.predict(test_data)
#metrics.f1_score(test_labels, predicted_labels, average = "micro")


### SVM classifier with sklearn

In [3]:
from sklearn import svm
# Support vector classifier configured with a linear kernel and C=0.1.
clfrSVM = svm.SVC(kernel="linear", C=0.1)

In [4]:
#clfrSVM.fit(train_data, train_label)
#predicted_labels = clfrSVM.predict(test_data)

### Model Selection in sklearn

In [5]:
from sklearn import model_selection
#X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, train_labels, test_size = 0.33, random_state = 0)

#### cross validation

In [6]:
#predicted_labels = model_selection.cross_val_predict(clfrSVM, train_data, train_labels, cv = 5) #commonly cv=10

### Naive Bayes classifier with NLTK

In [7]:
from nltk.classify import NaiveBayesClassifier

In [8]:
#classifier = NaiveBayesClassifier.train(train_set)
#classifier.classify(unlabeled_data)
#classifier.classify_many(unlabeled_data)
#nltk.classify.util.accuracy(classifier, test_set)
#classifier.labels() #gives you all the labels that the classifier has trained on
#classifier.show_most_informative_features()

#### NLTK doesn't have SVM, so we use sklearn:

In [9]:
from nltk.classify import SklearnClassifier

In [10]:
#clfrNB = SklearnClassifier(naive_bayes.MultinomialNB()).train(train_set)
#clfrSVM = SklearnClassifier(svm.SVC(kernel="linear")).train(train_set)
# the rest are similar to sklearn

#### We'll need to convert the text into a numeric representation that scikit-learn can use.
#### The bag-of-words approach is a simple and commonly used way to represent text for use in machine learning; it ignores structure and only counts how often each word occurs. CountVectorizer allows us to use the bag-of-words approach by converting a collection of text documents into a matrix of token counts.
#### First, we instantiate the CountVectorizer and fit it to our training data.
#### Fitting the CountVectorizer consists of tokenizing the training data and building the vocabulary.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
# vect = CountVectorizer().fit(X_train)

# vect.get_feature_names()[::2000]
# len(vect.get_feature_names())

# transform the documents in the training data to a document-term matrix
# X_train_vectorized = vect.transform(X_train)

#### This representation is stored in a SciPy sparse matrix, where each row corresponds to a document and each column a word from our training vocabulary.
#### We'll use LogisticRegression, which works well for high dimensional sparse data.

In [12]:
from sklearn.linear_model import LogisticRegression

# Instantiate the classifier only; the fit call below is a commented-out usage example.
model = LogisticRegression()
# model.fit(X_train_vectorized, y_train)

In [13]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
#predictions = model.predict(vect.transform(X_test))

#print('AUC: ', roc_auc_score(y_test, predictions))

In [14]:
#### get the feature names as numpy array
#feature_names = np.array(vect.get_feature_names())

#### Sort the coefficients from the model
#sorted_coef_index = model.coef_[0].argsort()

#### Find the 10 smallest and 10 largest coefficients
#### The 10 largest coefficients are being indexed using [:-11:-1] 
#### so the list returned is in order of largest to smallest
#print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
#print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

#### There is a different approach, which allows us to rescale features called tf–idf.
#### Tf–idf, or Term frequency-inverse document frequency, allows us to weight terms based on how important they are to a document.
#### Features with high tf–idf are frequently used within specific documents, but rarely used across all documents.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
#vect = TfidfVectorizer(min_df=5).fit(X_train)
#len(vect.get_feature_names())

#X_train_vectorized = vect.transform(X_train)

#model = LogisticRegression()
#model.fit(X_train_vectorized, y_train)

#predictions = model.predict(vect.transform(X_test))

#print('AUC: ', roc_auc_score(y_test, predictions))

#feature_names = np.array(vect.get_feature_names())

#sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

#print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
#print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

#sorted_coef_index = model.coef_[0].argsort()

#print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
#print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

#### CountVectorizer and TfidfVectorizer both take an argument, `min_df`, which allows us to specify a minimum number of documents in which a token needs to appear to become part of the vocabulary.
#### This helps us remove some words that might appear in only a few documents and are unlikely to be useful predictors. For example, here we'll pass in min_df = 5, which will remove any words from our vocabulary that appear in fewer than five documents.

# n-grams

In [17]:
#### Fit the CountVectorizer to the training data specifiying a minimum 
#### document frequency of 5 and extracting 1-grams and 2-grams
#vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

#X_train_vectorized = vect.transform(X_train)

#len(vect.get_feature_names())

In [18]:
#model = LogisticRegression()
#model.fit(X_train_vectorized, y_train)

#predictions = model.predict(vect.transform(X_test))

#print('AUC: ', roc_auc_score(y_test, predictions))

In [19]:
#feature_names = np.array(vect.get_feature_names())

#sorted_coef_index = model.coef_[0].argsort()

#print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
#print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

#### If we want to add more features to the sparse matrix to see if we can improve our performance, we can use the following function:

In [21]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a sparse feature matrix.

    feature_to_add may be a single feature (one value per row of X) or a
    list of such features. The combined matrix is returned in CSR format.
    """
    from scipy.sparse import csr_matrix, hstack

    # csr_matrix() lays the given feature(s) out as row(s); transposing turns
    # them into column(s) that can be stacked to the right of X.
    extra_columns = csr_matrix(feature_to_add).T
    combined = hstack([X, extra_columns], format='csr')
    return combined

In [1]:
##### For example, adding the length of the document as a feature:
#X_train_n = add_feature(X_train_vectorized, X_train.apply(len))

## For text similarity, topic modeling and information extraction, go to week four of text mining 