# sentiment analysis


In [1]:
from textblob import TextBlob

In [12]:
text = "are you crazy"
tb = TextBlob(text)
polarity = tb.sentiment.polarity
if polarity>0.2:
    print("this is a good comment")
elif polarity < -0.2:
    print("this is a bad comment")
else: 
    print("not clear")
print(polarity)
    

this is a bad comment
-0.6


# Text generation¶

In [14]:
!pip install transformers




In [2]:
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")

In [4]:
model = GPT2LMHeadModel.from_pretrained("gpt2-large", pad_token_id=tokenizer.eos_token_id)

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

In [12]:
sentence = 'so good to be back'
input_ids = tokenizer.encode(sentence, return_tensors='pt')

In [13]:
input_ids

tensor([[568, 922, 284, 307, 736]])

In [17]:
# generate text until the output length (which includes the context length) reaches 50
output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

In [18]:
output

tensor([[  568,   922,   284,   307,   736,   526,   198,   198,     1,  1026,
           338,   587,   257,   890,   640,  1201,   314,  1053,   587,   736,
           553,   339,   531,    13,   366,    40,  1101,  9675,   314,  1392,
           284,   466,   340,    13,   632,   373,   257,  1256,   286,  1257,
            13,   314,  1101,  2045,  2651,   284,   262,  1334,   286,   262,
          1622,   526, 50256]])

In [19]:
print(tokenizer.decode(output[0], skip_special_tokens=True))

so good to be back."

"It's been a long time since I've been back," he said. "I'm glad I got to do it. It was a lot of fun. I'm looking forward to the rest of the season."


# text summarization

In [20]:
from transformers import pipeline

In [21]:
#Load Summarization Pipeline
summarizer = pipeline("summarization")

Downloading:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [29]:
#Summarize Text
ARTICLE = """Perhaps one of the most significant advances made by Arabic mathematics began at this time with the work of al-Khwarizmi, namely 
the beginnings of algebra. It is important to understand just how significant this new idea was. It was a revolutionary move away from 
the Greek concept of mathematics which was essentially geometry. Algebra was a unifying theory which allowed rational 
numbers, irrational numbers, geometrical magnitudes, etc., to all be treated as "algebraic objects". It gave mathematics a whole new 
development path so much broader in concept to that which had existed before, and provided a vehicle for future development of the 
subject. Another important aspect of the introduction of algebraic ideas was that it allowed mathematics to be applied to itself in a 
way which had not happened before.
"""

In [30]:
summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)


[{'summary_text': ' Algebra was a unifying theory which allowed rational numbers, irrational numbers, geometrical magnitudes, etc., to all be treated as "algebraic objects" It gave mathematics a whole new development path so much broader in concept to that which had existed before .'}]

# Name entity recognition using spacy


In [31]:
import spacy 
from spacy import displacy

In [32]:
#Download spacy models
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0

2021-08-14 00:35:33.630540: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-08-14 00:35:33.648989: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.



  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [33]:
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

In [40]:
text = "Apple acquired Zoom in China on Wednesday 6th May 2020.\
This news has made Apple and Google stock jump by 5% on Dow Jones Index in the \
United States of America"
import pandas as pd

In [41]:
doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels,'Position_Start':position_start, 'Position_End':position_end})

df

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,(Apple),ORG,0,5
1,(Zoom),ORG,15,19
2,(China),GPE,23,28
3,"(Wednesday, 6th)",DATE,32,45
4,(Apple),ORG,74,79
5,"(5, %)",PERCENT,105,107
6,"(Dow, Jones)",ORG,111,120
7,"(the, United, States, of, America)",GPE,130,158


In [46]:
spacy.explain("ORG")


'Companies, agencies, institutions, etc.'

In [45]:
spacy.explain("GPE")

'Countries, cities, states'

# translation

In [16]:
from transformers import MarianMTModel, MarianTokenizer

In [22]:
src_text = "Hi everyone I hope you're doing ok"

In [18]:
model_name = 'Helsinki-NLP/opus-mt-en-roa'

In [19]:
tokenizer = MarianTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/786k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/793k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

In [20]:
print(tokenizer.supported_language_codes)

['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']


In [21]:
model = MarianMTModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/295M [00:00<?, ?B/s]

In [23]:
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [25]:
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

['Hi a todos los que espero que te trobes bien']

# Feature Extraction

In [26]:
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.},
]

In [27]:
from sklearn.feature_extraction import DictVectorizer

In [28]:
vec = DictVectorizer()

vec.fit_transform(measurements).toarray()

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [29]:
vec.get_feature_names()

['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']

# Filling masked text

In [30]:
from transformers import pipeline

In [31]:
unmasker = pipeline('fill-mask', model='bert-base-cased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [33]:
unmasker("are you [MASK] of you're mind")

[{'sequence': "are you out of you're mind",
  'score': 0.3947642147541046,
  'token': 1149,
  'token_str': 'out'},
 {'sequence': "are you sure of you're mind",
  'score': 0.049038831144571304,
  'token': 1612,
  'token_str': 'sure'},
 {'sequence': "are you afraid of you're mind",
  'score': 0.03349035978317261,
  'token': 3737,
  'token_str': 'afraid'},
 {'sequence': "are you, of you're mind",
  'score': 0.02809295617043972,
  'token': 117,
  'token_str': ','},
 {'sequence': "are you aware of you're mind",
  'score': 0.019816407933831215,
  'token': 4484,
  'token_str': 'aware'}]

# Question answering

In [36]:
from transformers import pipeline

nlp = pipeline("question-answering")

context = r"""
Barack Obama né le 4 août 1961 à Honolulu (Hawaï), est un homme d'État américain. Il est le 44e président des États-Unis, en fonction du 20 janvier 2009 au 20 janvier 2017.
"""

print(nlp(question="qui est obama?", context=context))

{'score': 0.03488485887646675, 'start': 52, 'end': 81, 'answer': "est un homme d'État américain"}
