In [3]:
import pandas as pd

In [4]:
df = pd.read_excel("./data/wos.xlsx")

In [5]:
df.head()

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


#### Some NLP concepts

- Corpus = data frame = set of documents.
- Vocabulary = set of (words).
- Vectorizer = Algorithm to go from corpus to matrix.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [7]:
vec = TfidfVectorizer(stop_words='english')

In [8]:
vec.fit(df['Abstract'])

TfidfVectorizer(stop_words='english')

In [9]:
vec.vocabulary_

{'dimensional': 35622,
 'non': 78779,
 'linear': 66174,
 'optical': 82286,
 'waves': 121975,
 'coherently': 27018,
 'excited': 43064,
 'resonant': 97851,
 'medium': 70129,
 'doped': 37289,
 'erbium': 41871,
 'atoms': 15302,
 'described': 34287,
 'schrodinger': 102412,
 'equation': 41764,
 'coupled': 30048,
 'self': 103240,
 'induced': 58670,
 'transparency': 115645,
 'equations': 41768,
 'hirota': 53971,
 'method': 71056,
 'symbolic': 111003,
 'computation': 27888,
 'forms': 46661,
 'soliton': 106528,
 'solutions': 106558,
 'obtained': 81050,
 'asymptotic': 15093,
 'analysis': 12134,
 'conducted': 28492,
 'suggests': 109942,
 'interaction': 59746,
 'solitons': 106530,
 'elastic': 39767,
 'bright': 20582,
 'fields': 45179,
 'dark': 32521,
 'ones': 81922,
 'field': 45173,
 'electric': 39842,
 'polarization': 89540,
 'population': 90119,
 'inversion': 60691,
 'profile': 92127,
 'dopant': 37284,
 'head': 52717,
 'bidirectional': 18308,
 'overtaking': 83573,
 'unidirectional': 118390,
 'see

In [10]:
len(vec.vocabulary_)

124529

In [11]:
df.shape

(46985, 7)

In [12]:
X = vec.transform(df['Abstract'])

In [13]:
X.shape

(46985, 124529)

In [14]:
X

<46985x124529 sparse matrix of type '<class 'numpy.float64'>'
	with 4077065 stored elements in Compressed Sparse Row format>

In [15]:
abstract = 'In this paper we study biochemistry of dimensional birds'

In [16]:
vec.transform([abstract]).todense().nonzero()

(array([0, 0, 0, 0, 0], dtype=int64),
 array([ 18569,  19062,  35622,  84515, 109146], dtype=int64))

#### Label encoding

Problem: $X$ is already there, but $y$ is still unstructured data... 

In [17]:
df['Domain'].unique()

array(['CS ', 'Medical ', 'Civil ', 'ECE ', 'biochemistry ', 'MAE ',
       'Psychology  '], dtype=object)

**Two workarounds:**

- One-hot encoding (1 if instance belongs to category, 0 if not). 
- Label encoding: replace labels by integers.

==> Beware of potential bias on label encoding (very often, no problem).

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [19]:
y = le.fit_transform(df['Domain'])

In [20]:
y

array([0, 4, 1, ..., 4, 0, 6])

In [21]:
le.classes_

array(['CS ', 'Civil ', 'ECE ', 'MAE ', 'Medical ', 'Psychology  ',
       'biochemistry '], dtype=object)

In [22]:
le.inverse_transform([1])

array(['Civil '], dtype=object)

#### Classification models

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # recommended for text/multi-dimensional problems

In [24]:
clfs = [
    LogisticRegression()
#    SVC()
]

for clf in clfs:
    clf.fit(X,y)
    print(clf.score(X,y))

0.9070554432265616


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
