In [130]:
import pandas as pd
import numpy as np

# Representing words with vectors using different methods
* https://towardsdatascience.com/art-of-vector-representation-of-words-5e85c59fee5

In [131]:
# Toy corpus: four short sentences held in a 1-D NumPy array of strings
texts = np.array([
    'Human machine interface for computer applications',
    'User opinion of computer system response time',
    'User interface management system',
    'System engineering for improved response time',
])

# one hot encoding
Each word is represented by a vector whose elements are all 0 except for a single 1 at that word's own position.

* By default, `pd.get_dummies()` performs one-hot encoding (one column per category), not dummy encoding; pass `drop_first=True` to get dummy encoding (k-1 columns).
* https://stats.stackexchange.com/questions/224051/one-hot-vs-dummy-encoding-in-scikit-learn

In [201]:
# Tokenize each sentence on whitespace and keep every distinct word once, in
# first-seen order. dict.fromkeys de-duplicates in a single pass (no repeated
# list scans) and preserves insertion order (guaranteed from Python 3.7).
# Matching is case-sensitive, so 'System' and 'system' stay separate tokens.
token_text = list(dict.fromkeys(word for text in texts for word in text.split()))

In [202]:
# convert the token list to a 1-D NumPy array so it can be reshaped below
token_text = np.asarray(token_text)

### using one hot encoding from sklearn.preprocessing lib

In [203]:
from sklearn.preprocessing import OneHotEncoder

In [204]:
# preview only (result is not stored): all tokens laid out as a single row, shape (1, n_tokens)
token_text.reshape((1, -1))

array([['Human', 'machine', 'interface', 'for', 'computer',
        'applications', 'User', 'opinion', 'of', 'system', 'response',
        'time', 'management', 'System', 'engineering', 'improved']],
      dtype='<U12')

In [205]:
# instantiate the one-hot encoder with its default settings
on = OneHotEncoder()

* scikit-learn estimators expect 2-D input: reshape your data using `array.reshape(-1, 1)` if it has a single feature, or `array.reshape(1, -1)` if it contains a single sample.

In [206]:
# one token per row: a single-feature column of shape (n_tokens, 1)
token_column = token_text.reshape(-1, 1)
# fit the encoder on the tokens and encode them in one step
X_one_hot = on.fit_transform(token_column)

In [207]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older releases.
one_hot_columns = (
    on.get_feature_names_out()
    if hasattr(on, 'get_feature_names_out')
    else on.get_feature_names()
)
# Densify the encoder output so each row shows one token's 0/1 vector
pd.DataFrame(X_one_hot.toarray(), columns=one_hot_columns)

Unnamed: 0,x0_Human,x0_System,x0_User,x0_applications,x0_computer,x0_engineering,x0_for,x0_improved,x0_interface,x0_machine,x0_management,x0_of,x0_opinion,x0_response,x0_system,x0_time
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### using get_dummies from pandas

In [208]:
# wrap the token array in a single-column DataFrame (one token per row)
s = pd.DataFrame(data=token_text)

In [209]:
# One-hot encode the token column with pandas; columns are named `token_<word>`.
# Pin an integer dtype: pandas >= 2.0 returns booleans by default, whereas the
# point here is a 0/1 representation (uint8 was the default before 2.0).
pd.get_dummies(s, prefix='token', dtype='uint8')

Unnamed: 0,token_Human,token_System,token_User,token_applications,token_computer,token_engineering,token_for,token_improved,token_interface,token_machine,token_management,token_of,token_opinion,token_response,token_system,token_time
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


# Feature extraction: bag of words
document x term matrix (DTM)
* https://scikit-learn.org/stable/modules/feature_extraction.html

**Note:** Feature extraction is very different from feature selection: the former consists of transforming arbitrary data, such as text or images, into numerical features usable for machine learning. The latter is a machine learning technique applied to these features.

In [47]:
# The four-sentence toy corpus as a plain Python list of strings
texts = [
    'Human machine interface for computer applications',
    'User opinion of computer system response time',
    'User interface management system',
    'System engineering for improved response time',
]

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

In [49]:
# instantiate the count vectorizer with its default settings
cv = CountVectorizer()

In [51]:
# fit the vectorizer on the corpus and transform it in one step;
# the result is converted with .toarray() below to display the document x term counts
X = cv.fit_transform(texts)

In [54]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older releases.
vocab_columns = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# Densify the matrix: one row per document, one column per vocabulary term
pd.DataFrame(X.toarray(), columns=vocab_columns)

Unnamed: 0,applications,computer,engineering,for,human,improved,interface,machine,management,of,opinion,response,system,time,user
0,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1
2,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1
3,0,0,1,1,0,1,0,0,0,0,0,1,1,1,0


# Distributed representation of words
* quantify co-occurrence
* term x term matrix: a matrix is built that captures the number of times each term appears in the context of another term

In [210]:
# TODO: work out how to implement the term x term co-occurrence matrix in Python

# Singular value decomposition