In [1]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC,LinearSVC
from sklearn.metrics import accuracy_score

# Read in data
data = pd.read_csv('mbti_1.csv')

In [2]:
# Stratify split to ensure equal distribution of data
train_data,test_data=train_test_split(data,test_size=0.2,random_state=42,stratify=data.type)

In [3]:
# Clean data
def clear_text(data):
    data_length=[]
    cleaned_text=[]
    for sentence in tqdm(data.posts):
        sentence=sentence.lower()
        
#         removing links from text data
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)
    
#         removing other symbols
        sentence=re.sub('[^0-9a-z]',' ',sentence)
  
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text,data_length

train_data.posts,train_length=clear_text(train_data)
test_data.posts,test_length=clear_text(test_data)

100%|██████████| 6940/6940 [00:03<00:00, 2126.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.posts,train_length=clear_text(train_data)
100%|██████████| 1735/1735 [00:00<00:00, 2053.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.posts,test_length=clear_text(test_data)


In [4]:
# Lemmatize and create td-idf
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

class Lemmatizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, sentence):
        return [self.lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in sentence.split() if len(word)>2]

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
vectorizer=TfidfVectorizer( max_features=5000,stop_words='english',tokenizer=Lemmatizer())
vectorizer.fit(train_data.posts)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/local/lib/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /usr/local/lib/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TfidfVectorizer(max_features=5000, stop_words='english',
                tokenizer=<__main__.Lemmatizer object at 0x12534dbe0>)

In [5]:
# Convert input data to Document-Term Matrix using tf-idf data
train_post=vectorizer.transform(train_data.posts).toarray()
test_post=vectorizer.transform(test_data.posts).toarray()

In [6]:
# Create labels for outputs
target_encoder=LabelEncoder()
train_target=target_encoder.fit_transform(train_data.type)
test_target=target_encoder.fit_transform(test_data.type)

In [7]:
# Logistic Regression
model_log=LogisticRegression(max_iter=3000,C=0.5,n_jobs=-1)
model_log.fit(train_post,train_target)

LogisticRegression(C=0.5, max_iter=3000, n_jobs=-1)

In [8]:
print('train classification report \n ',classification_report(train_target,model_log.predict(train_post),target_names=target_encoder.inverse_transform([i for i in range(16)])))

train classification report 
                precision    recall  f1-score   support

        ENFJ       0.85      0.15      0.26       152
        ENFP       0.81      0.65      0.72       540
        ENTJ       0.93      0.29      0.44       185
        ENTP       0.82      0.67      0.74       548
        ESFJ       0.00      0.00      0.00        34
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       1.00      0.04      0.08        71
        INFJ       0.74      0.83      0.78      1176
        INFP       0.66      0.93      0.77      1465
        INTJ       0.74      0.81      0.78       873
        INTP       0.69      0.87      0.77      1043
        ISFJ       0.89      0.25      0.39       133
        ISFP       0.86      0.25      0.39       217
        ISTJ       0.83      0.27      0.41       164
        ISTP       0.87      0.53      0.66       270

    accuracy                           0.72      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print('test classification report \n',classification_report(test_target,model_log.predict(test_post),target_names=target_encoder.inverse_transform([i for i in range(16)])))

test classification report 
               precision    recall  f1-score   support

        ENFJ       1.00      0.08      0.15        38
        ENFP       0.74      0.54      0.62       135
        ENTJ       0.67      0.13      0.22        46
        ENTP       0.65      0.50      0.56       137
        ESFJ       0.00      0.00      0.00         8
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.00      0.00      0.00        18
        INFJ       0.65      0.72      0.68       294
        INFP       0.57      0.88      0.69       367
        INTJ       0.64      0.68      0.66       218
        INTP       0.67      0.83      0.74       261
        ISFJ       0.67      0.12      0.21        33
        ISFP       0.92      0.20      0.33        54
        ISTJ       0.50      0.05      0.09        41
        ISTP       0.68      0.45      0.54        67

    accuracy                           0.63      17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
model_svc=SVC()
model_svc.fit(train_post,train_target)

SVC()

In [11]:
print('train classification report \n ',classification_report(train_target,model_svc.predict(train_post),target_names=target_encoder.inverse_transform([i for i in range(16)])))
print('test classification report \n ',classification_report(test_target,model_svc.predict(test_post),target_names=target_encoder.inverse_transform([i for i in range(16)])))

train classification report 
                precision    recall  f1-score   support

        ENFJ       0.96      0.84      0.90       152
        ENFP       0.95      0.94      0.95       540
        ENTJ       0.98      0.89      0.93       185
        ENTP       0.95      0.96      0.95       548
        ESFJ       1.00      0.59      0.74        34
        ESFP       1.00      0.37      0.54        38
        ESTJ       1.00      0.52      0.68        31
        ESTP       1.00      0.82      0.90        71
        INFJ       0.95      0.96      0.95      1176
        INFP       0.92      0.98      0.95      1465
        INTJ       0.96      0.96      0.96       873
        INTP       0.94      0.97      0.95      1043
        ISFJ       0.99      0.89      0.94       133
        ISFP       0.97      0.88      0.93       217
        ISTJ       0.94      0.91      0.92       164
        ISTP       0.97      0.93      0.95       270

    accuracy                           0.95      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
model_linear_svc=LinearSVC(C=0.1)
model_linear_svc.fit(train_post,train_target)

LinearSVC(C=0.1)

In [13]:
print('train classification report \n ',classification_report(train_target,model_linear_svc.predict(train_post),target_names=target_encoder.inverse_transform([i for i in range(16)])))
print('test classification report \n',classification_report(test_target,model_linear_svc.predict(test_post),target_names=target_encoder.inverse_transform([i for i in range(16)])))

train classification report 
                precision    recall  f1-score   support

        ENFJ       0.91      0.44      0.59       152
        ENFP       0.84      0.76      0.80       540
        ENTJ       0.92      0.64      0.76       185
        ENTP       0.84      0.81      0.82       548
        ESFJ       0.83      0.29      0.43        34
        ESFP       1.00      0.16      0.27        38
        ESTJ       1.00      0.23      0.37        31
        ESTP       0.95      0.51      0.66        71
        INFJ       0.81      0.85      0.83      1176
        INFP       0.77      0.93      0.84      1465
        INTJ       0.83      0.85      0.84       873
        INTP       0.81      0.90      0.85      1043
        ISFJ       0.93      0.65      0.77       133
        ISFP       0.88      0.59      0.71       217
        ISTJ       0.88      0.65      0.75       164
        ISTP       0.89      0.81      0.85       270

    accuracy                           0.82      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
models_accuracy={}
models_accuracy['logistic regression']=accuracy_score(test_target,model_log.predict(test_post))
models_accuracy['Linear Support Vector classifier']=accuracy_score(test_target,model_linear_svc.predict(test_post))
models_accuracy['Support Vector classifier']=accuracy_score(test_target,model_svc.predict(test_post))

In [15]:
print(models_accuracy)

{'logistic regression': 0.631700288184438, 'Linear Support Vector classifier': 0.6645533141210375, 'Support Vector classifier': 0.6426512968299711}


In [41]:
import io
import uvicorn
import numpy as np
import nest_asyncio
from enum import Enum
from fastapi import FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse

In [None]:
# Assign an instance of the FastAPI class to the variable "app".
# You will interact with your api using this instance.
app = FastAPI(title='Deploying a ML Model with FastAPI')

class Model(str, Enum):
    model1 = 'Logistic Regression Classifier'
    model2= 'SVM Classifier'
    model3 = 'Linear SVM Classifier'

# By using @app.get("/") you are allowing the GET method to work for the / endpoint.
@app.get("/")
def home():
    return "Welcome to the personality predictor. Now head over to http://localhost:8000/docs."

# This endpoint handles all the logic necessary for the object detection to work.
# It requires the desired model and the image in which to perform object detection.
@app.post("/predict") 
def prediction(model: Model, sentence: str):
    
    if len(sentence) == 0:
        raise HTTPException(status_code=415, detail="Input must not be empty.")
        
    sentence = [sentence]
    
    def clear_request_text(data):
        cleaned_text=[]
        for sentence in tqdm(data):
            sentence=sentence.lower()

    #         removing links from text data
            sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)

    #         removing other symbols
            sentence=re.sub('[^0-9a-z]',' ',sentence)

            cleaned_text.append(sentence)
        return cleaned_text

    # 1. CLEAR TEXT
    cleared_sentence = clear_request_text(sentence)
    
    # 2. TRANSFORM SENTENCE TO DOCUMENT-TERM MATRIX
    dtm_sentence = vectorizer.transform(cleared_sentence).toarray()
    
    # 3. RUN PERSONALITY PREDICTOR MODEL
    if model == 'Logistic Regression Classifier':
        prediction = target_encoder.inverse_transform([i for i in range(16)])[model_log.predict(dtm_sentence)]
    elif model == 'SVM Classifier':
        prediction = target_encoder.inverse_transform([i for i in range(16)])[model_svc.predict(dtm_sentence)]
    elif model == 'Linear SVM Classifier':
        prediction = target_encoder.inverse_transform([i for i in range(16)])[model_linear_svc.predict(dtm_sentence)]
    
    # 4. RETURN PREDICTION
    return str(prediction)

# Allows the server to be run in this interactive environment
nest_asyncio.apply()

# Host depends on the setup you selected (docker or virtual env)
host = "127.0.0.1"

# Spin up the server!    
uvicorn.run(app, host=host, port=8000)

INFO:     Started server process [87265]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:58353 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:58353 - "GET /openapi.json HTTP/1.1" 200 OK


100%|██████████| 1/1 [00:00<00:00, 14716.86it/s]

INFO:     127.0.0.1:58353 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=hi HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 14027.77it/s]

INFO:     127.0.0.1:58354 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=i%20like%20reading%20books%20and%20chilling%20on%20the%20vibes HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 16131.94it/s]

INFO:     127.0.0.1:58374 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=i%20am%20sad HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 12052.60it/s]

INFO:     127.0.0.1:58376 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=i%20am%20happy HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 6452.78it/s]

INFO:     127.0.0.1:58382 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=i%20am%20angry HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 13148.29it/s]

INFO:     127.0.0.1:58389 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=hi%20i%20feel%20very%20sad%20today HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 16131.94it/s]

INFO:     127.0.0.1:58397 - "POST /predict?model=SVM%20Classifier&sentence=i%20am%20sad HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 15650.39it/s]

INFO:     127.0.0.1:58397 - "POST /predict?model=SVM%20Classifier&sentence=i%20am%20happy HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 16131.94it/s]

INFO:     127.0.0.1:58401 - "POST /predict?model=SVM%20Classifier&sentence=i%20am%20moody HTTP/1.1" 200 OK





INFO:     127.0.0.1:58402 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:58402 - "GET /openapi.json HTTP/1.1" 200 OK


100%|██████████| 1/1 [00:00<00:00, 10131.17it/s]

INFO:     127.0.0.1:58403 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=today%20i%20feel%20very%20sad HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 15141.89it/s]

INFO:     127.0.0.1:58417 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=today%20i%20feel%20very%20angry HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 10433.59it/s]

INFO:     127.0.0.1:58431 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=today%20i%20feel%20very%20angry.%20why%20is%20no%20one%20talking%20to%20me. HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 3663.15it/s]

INFO:     127.0.0.1:58456 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DqsXHcwe3krw%7C%7C%7Chttp%3A%2F%2F41.media.tumblr.com%2Ftumblr_lfouy03PMA1qa1rooo1_500.jpg%7C%7C%7Cenfp%20and%20intj%20moments%20%20https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Diz7lE1g4XM4%20%20sportscenter%20not%20top%20ten%20plays%20%20https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DuCdfze1etec%20%20pranks%7C%7C%7CWhat%20has%20been%20the%20most%20life-changing%20experience%20in%20your%20life%3F%7C%7C%7Chttp%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DvXZeYwwRDw8%20%20%20http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Du8ejam5DP3E%20%20On%20repeat%20for%20most%20of%20today.%7C%7C%7CMay%20the%20PerC%20Experience%20immerse%20you.%7C%7C%7CThe%20last%20thing%20my%20INFJ%20friend%20posted%20on%20his%20facebook%20before%20committing%20suicide%20the%20next%20day.%20Rest%20in%20peace~%20%20%20http%3A%2F%2Fvimeo.com%2F22842206%7C%7C%7CHello%20ENFJ7.%20Sorry%20to%20hear%20of%20y


100%|██████████| 1/1 [00:00<00:00, 13148.29it/s]

INFO:     127.0.0.1:58456 - "POST /predict?model=Logistic%20Regression%20Classifier&sentence=today%20i%20feel%20very%20angry.%20why%20is%20no%20one%20talking%20to%20me. HTTP/1.1" 200 OK





In [34]:
# Allows the server to be run in this interactive environment
nest_asyncio.apply()

# Host depends on the setup you selected (docker or virtual env)
host = "127.0.0.1"

# Spin up the server!    
uvicorn.run(app, host=host, port=8000)

INFO:     Started server process [87265]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:57798 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:57798 - "GET /openapi.json HTTP/1.1" 200 OK


100%|██████████| 1/1 [00:00<00:00, 14463.12it/s]

INFO:     127.0.0.1:57799 - "POST /predict?model=2&sentence=hi%20i%20woke%20up%20feeling%20great%20today%20thank%20you%20all HTTP/1.1" 500 Internal Server Error



ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/uvicorn/protocols/http/h11_impl.py", line 373, in run_asgi
    result = await app(self.scope, self.receive, self.send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/uvicorn/middleware/proxy_headers.py", line 75, in __call__
    return await self.app(scope, receive, send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/fastapi/applications.py", line 208, in __call__
    await super().__call__(scope, receive, send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/starlette/applications.py", line 112, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/starlette/middleware/errors.py", line 181, in __call__
    raise exc
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/starlette/middleware/errors.py", line 159, in __ca

INFO:     127.0.0.1:57803 - "POST /predict?model=2&sentence=hi%20i%20woke%20up%20feeling%20great%20today%20thank%20you%20all HTTP/1.1" 500 Internal Server Error



ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/uvicorn/protocols/http/h11_impl.py", line 373, in run_asgi
    result = await app(self.scope, self.receive, self.send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/uvicorn/middleware/proxy_headers.py", line 75, in __call__
    return await self.app(scope, receive, send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/fastapi/applications.py", line 208, in __call__
    await super().__call__(scope, receive, send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/starlette/applications.py", line 112, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/starlette/middleware/errors.py", line 181, in __call__
    raise exc
  File "/Library/anaconda3/envs/ml/lib/python3.9/site-packages/starlette/middleware/errors.py", line 159, in __ca