## Module list

In [3]:
# https://docs.python.org/3/library/index.html

# https://www.w3schools.com/python/python_ref_functions.asp

# text processing services
import re
# data types
import datetime
import pprint
# numeric and maths
import random
# functional programming
import itertools
import functools
# file and directory access
import pathlib
import os.path
# data persistence
import pickle
# data compression and archiving
import zipfile
# generic operating system
import os
import io
import time
# internet data handling
import json
import base64
# structured markup processing tool
import xml.etree.ElementTree
import xml.dom.minidom
# internet protocols and support
import webbrowser
import urllib.request
# debugging and profiling
import timeit
# python runtime services
import sys

# https://pypi.org/project/pytz/
import pytz

# https://lxml.de/tutorial.html
from lxml import etree

# https://ipython.readthedocs.io/en/8.16.0/api/generated/IPython.display.html
from IPython.display import Image

import chromadb

import pythainlp
import nltk

### JSON dumps

In [14]:
# Read a JSON file:
#   1. json.load  -> parse the file into Python objects
#   2. json.dumps -> serialize those objects back to a JSON string

# JSON text is UTF-8; pass the encoding explicitly so the read does not
# depend on the platform's locale encoding (e.g. on Windows).
with open('data/example.json', encoding='utf-8') as f:
    data_json = json.load(f)

print(json.dumps(data_json, indent=4))


{
    "test1": {
        "email": "angela@gmail.com",
        "password": "!b(BVRZ9m$vr+8"
    },
    "test2": {
        "email": "angela@gmail.com",
        "password": "eH12GA##5&vIRg!b(BVRZ9m$vr+8"
    },
    "test3": {
        "email": "angela@gmail.com",
        "password": "#wpzRt#yFQ5E3"
    },
    "testn": {
        "email": "angela@gmail.com",
        "password": "fy(0(Ek%9BQY$h"
    }
}


### pprint

In [15]:
import pprint

# Pretty-print the parsed JSON through an explicitly configured printer:
# 4-space nesting indent, lines wrapped at 40 columns.
pprint.PrettyPrinter(indent=4, width=40).pprint(data_json)

{   'test1': {   'email': 'angela@gmail.com',
                 'password': '!b(BVRZ9m$vr+8'},
    'test2': {   'email': 'angela@gmail.com',
                 'password': 'eH12GA##5&vIRg!b(BVRZ9m$vr+8'},
    'test3': {   'email': 'angela@gmail.com',
                 'password': '#wpzRt#yFQ5E3'},
    'testn': {   'email': 'angela@gmail.com',
                 'password': 'fy(0(Ek%9BQY$h'}}


In [16]:
# Serialize the parsed JSON back to text with a 4-space indent and print it.
# NOTE: despite its name, data_dict holds the str returned by json.dumps, not a dict.
data_dict = json.dumps(data_json,indent=4)
print(data_dict)

{
    "test1": {
        "email": "angela@gmail.com",
        "password": "!b(BVRZ9m$vr+8"
    },
    "test2": {
        "email": "angela@gmail.com",
        "password": "eH12GA##5&vIRg!b(BVRZ9m$vr+8"
    },
    "test3": {
        "email": "angela@gmail.com",
        "password": "#wpzRt#yFQ5E3"
    },
    "testn": {
        "email": "angela@gmail.com",
        "password": "fy(0(Ek%9BQY$h"
    }
}


### pathlib

- create path object: pathlib.Path('...')
- join path to file: <path> / <filename.xxx>

In [17]:
import pathlib
import os.path

path = pathlib.Path('new_directory')

# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that could race with another process.
path.mkdir(exist_ok=True)

file_path = path / 'sample_file.txt'   # "/" joins a path with a file name
# Fix the encoding so the file is written and read the same way on every platform.
with open(file_path, 'w', encoding='utf-8') as f:
    f.write("test create new file")

if file_path.is_file():    # True only when the path exists and is a regular file
    print(f"file name {file_path} exists")
    with open(file_path, encoding='utf-8') as f:
        print(f.read())


file name new_directory\sample_file.txt exists
test create new file


In [25]:
# pathlib.Path() with no arguments (same as pathlib.Path('')) is the relative
# path ".", i.e. the current working directory -- not the user's home directory.
path = pathlib.Path()
print(path)

# Read note.txt from that directory and echo its contents.
file_path = path.joinpath('note.txt')
with file_path.open() as f:
    print(f.read())

.
zzzzz


In [None]:
# Path() accepts "/" separators in its argument; printing the path shows the
# separator of the OS the notebook runs on.
path = pathlib.Path('new_directory/new_subdirectory')
print(path)

# The "/" operator joins path components without hard-coding an OS separator.
file_path = path / 'sample_insub.txt'
print(file_path.read_text())

new_directory\new_subdirectory
this is sample in sub


#### read

- read()	
  - A single string of all content	
  - For reading small files entirely at once
- readline()	
  - A single line at a time	
  - For reading large files line-by-line
- readlines()	
  - A list of all lines	
  - For processing all lines in list form

### zipfile

In [19]:
import zipfile

In [None]:
# with zipfile.ZipFile('data/blognone.zip', 'w', compression =zipfile.ZIP_DEFLATED) as zipf:

In [None]:
# Create example.zip containing a single file.
with zipfile.ZipFile('gitignore/example.zip', mode='w') as archive:
    archive.write(filename='gitignore/Syllabus67.1 2603655 v2.pdf')

In [None]:
# Bundle several files into one archive, added in the order listed.
with zipfile.ZipFile('gitignore/example2.zip', mode='w') as archive:
    for member in ('gitignore/test1.txt', 'gitignore/test2.txt'):
        archive.write(member)

In [None]:
# ZipFile.writestr(<name inside the archive>, <data>) creates an archive member
# directly from an in-memory string; no source file on disk is needed.
with zipfile.ZipFile('gitignore/example3.zip', mode='w') as archive:
    for member_name, text in (
        ('file1.txt', 'This is the content of file1.'),
        ('file2.txt', 'This is the content of file2.'),
    ):
        archive.writestr(member_name, text)


In [None]:
# Unpack every member of example2.zip under gitignore/DestinationFolder.
# ZipFile opens in read mode ('r') by default.
with zipfile.ZipFile('gitignore/example2.zip') as archive:
    archive.extractall(path='gitignore/DestinationFolder')

In [None]:
# namelist() returns the names of the archive members as a list.
with zipfile.ZipFile('gitignore/example2.zip') as archive:
    member_names = archive.namelist()
print(member_names)

['gitignore/test1.txt', 'gitignore/test2.txt']


In [None]:
# Read one member straight from the archive without extracting it to disk.
# The data comes back as bytes, hence the b'...' in the printed output.
with zipfile.ZipFile('gitignore/example2.zip') as archive:
    content = archive.read('gitignore/test1.txt')
    print(content)

b'zip file test1'


### directory

#### directory by defined function

In [None]:
from pathlib import Path

def print_directory_tree(root, indent=""):
    """Print the contents of ``root`` as an indented tree, one entry per line.

    Entries that are not regular files (i.e. directories) are listed first,
    then files; each group is sorted by name. Real sub-directories are
    expanded recursively. The name of ``root`` itself is not printed.

    Args:
        root: Directory to list (``str`` or ``pathlib.Path``).
        indent: Prefix printed before each entry. Callers normally leave the
            default; the recursion supplies it for nested levels.

    Raises:
        OSError: If ``root`` (or a nested directory) cannot be listed, e.g.
            because it does not exist or is not a directory.
    """
    root_path = Path(root)
    # The (is_file, name) key sorts directories (False) ahead of files (True).
    items = sorted(root_path.iterdir(), key=lambda x: (x.is_file(), x.name))
    last_index = len(items) - 1

    for i, item in enumerate(items):
        is_last = i == last_index
        connector = "└──" if is_last else "├──"
        print(f"{indent}{connector} {item.name}")

        # Do not descend into symlinked directories: a link that points back
        # to one of its ancestors would otherwise be expanded over and over.
        if item.is_dir() and not item.is_symlink():
            # Below the last entry no vertical guide line is needed.
            new_indent = indent + ("    " if is_last else "│   ")
            print_directory_tree(item, new_indent)


In [None]:
# Print the tree of the "perplexity" folder (path relative to the working directory).
print_directory_tree("perplexity")


├── .ipynb_checkpoints
│   ├── Perplex01_List-checkpoint.ipynb
│   ├── Perplex02_Set-checkpoint.ipynb
│   ├── Perplex03_Dictionary-checkpoint.ipynb
│   ├── Perplex04_Map_Zip-checkpoint.ipynb
│   ├── Perplex05_Pandas_beginner-checkpoint.ipynb
│   ├── Perplex06_Pandas_advanced-checkpoint.ipynb
│   ├── Perplex07_DataVis-checkpoint.ipynb
│   ├── Perplex08_DataCleaning_intro-checkpoint.ipynb
│   ├── Perplex09_MissingValue-checkpoint.ipynb
│   ├── Perplex10_Outlier-checkpoint.ipynb
│   ├── Perplex11_Standardization-checkpoint.ipynb
│   ├── Perplex12_DimensionReduction-checkpoint.ipynb
│   └── Perplex13_Serialization-checkpoint.ipynb
├── Perplex01_List.ipynb
├── Perplex02_Set.ipynb
├── Perplex03_Dictionary.ipynb
├── Perplex04_Map_Zip.ipynb
├── Perplex05_Pandas_beginner.ipynb
├── Perplex06_Pandas_advanced.ipynb
├── Perplex07_DataVis.ipynb
├── Perplex08_DataCleaning_intro.ipynb
├── Perplex09_MissingValue.ipynb
├── Perplex10_Outlier.ipynb
├── Perplex11_Standardization.ipynb
├── Perplex12_Dimensi

#### `directory_tree`

https://pypi.org/project/directory-tree/

DisplayTree(
    dirPath: str='',
    stringRep: bool=False,
    header: bool=False,
    maxDepth: float=float('inf'),
    showHidden: bool=False,
    ignoreList: List[str]=None,
    onlyFiles: bool=False,
    onlyDirs: bool=False,
    sortBy: int=0
) -> Union[str, None]:

In [None]:
# DisplayTree comes from the third-party "directory_tree" package.
from directory_tree import DisplayTree
# Called with only a path, so every other option keeps its default value.
DisplayTree("perplexity")

perplexity/
├── Perplex01_List.ipynb
├── Perplex02_Set.ipynb
├── Perplex03_Dictionary.ipynb
├── Perplex04_Map_Zip.ipynb
├── Perplex05_Pandas_beginner.ipynb
├── Perplex06_Pandas_advanced.ipynb
├── Perplex07_DataVis.ipynb
├── Perplex08_DataCleaning_intro.ipynb
├── Perplex09_MissingValue.ipynb
├── Perplex10_Outlier.ipynb
├── Perplex11_Standardization.ipynb
├── Perplex12_DimensionReduction.ipynb
└── Perplex13_Serialization.ipynb


In [None]:
# Print the tree of the "data" folder.
DisplayTree('data')

data/
├── aansnook_books.json
├── blognone.json
├── blognone.zip
├── BOW Vect.txt
├── bread_production_data.csv
├── cartoon1.jpg
├── cartoon2.jpg
├── churn_data.csv
├── content.txt
├── csv_json_trans.csv
├── csv_json_trans.csv.zip
├── csv_trans_tab.csv
├── custom_words.json
├── data.xml
├── eurofxref-daily.xml
├── example.json
├── example_xml.xml
├── historyExchange5Days.zip
├── house_data.csv
├── json0.json
├── json1.json
├── json2.json
├── json3.json
├── json4.json
├── khaosod.json
├── khaosod.zip
├── LLR.txt
├── lumphini_park.jpg
├── persons.json
├── persons.xml
├── persons0.json
├── plot_count_tfidf.txt
├── regex example.csv
├── restaurant_ratings.csv
├── smart_city.csv
├── test.html
├── thaiglobal_logistics_data.csv
├── thaiglobal_logistics_data_rev2.csv
├── thaisky_airways_bookings.csv
├── tham_luang_rescue_data.csv
├── tham_luang_rescue_data.xlsx
├── tips.csv
├── traffic_data.csv
├── ultra_easy_vector_db.txt
└── weather655.xml


### format

In [None]:
# Print zero-padded range labels ("001-010", "011-020", ...) covering
# start..end in chunks of `step`; the upper bound is capped at `end`.
start, end, step = 1, 100, 10

for i in range(start, end + 1, step):
    # ":03d" left-pads the number with zeros to three digits.
    range_label = f"{i:03d}-{min(i + step - 1, end):03d}"
    print(range_label)


### to_json

#### `df.to_json`(orient='table')
- needed in order to open JSON that is not in the simple (default) JSON format, e.g. a file saved with `orient='table'`

In [None]:
import json
import pandas as pd

# The file is read with orient='table'; the plain call without orient is kept
# below for comparison:
# df_blognone = pd.read_json('data/blognone.json')
df_blognone = pd.read_json(path_or_buf='data/blognone.json', orient='table')
df_blognone.head(3)

Unnamed: 0,title,content
0,รีวิว ASUS ROG Ally X เทียบ ROG Ally รุ่นแรก ค...,ในช่วง 2-3 ปีที่ผ่านมา วงการเกมมีความเคลื่อนไห...
1,Meta นำ Facial Recognition มาใช้งานอีกครั้งหลั...,Meta เตรียมนำระบบรู้จำใบหน้าหรือ Facial Recogn...
2,Google เตรียมปิดการทำงานกล่อง Search ย่อยเจาะจ...,กูเกิลแจ้งการปิดการทำงานกล่องค้นหาย่อยในผลค้นห...


#### `df.to_json`(force_ascii)


- `force_ascii` = True
  - converts non-ASCII characters (e.g. Thai text) to ASCII `\uXXXX` escape sequences

In [6]:
# Two columns; the second value of column1 is Thai (non-ASCII) text.
data = {
    'column1': ['value1', 'ค่าใช้จ่าย'],
    'column2': [1, 2],
}
df = pd.DataFrame(data=data)

# With force_ascii=True every non-ASCII character is written as a \uXXXX
# escape, so the resulting JSON text is pure ASCII.
json_data = df.to_json(force_ascii=True)
json_data

'{"column1":{"0":"value1","1":"\\u0e04\\u0e48\\u0e32\\u0e43\\u0e0a\\u0e49\\u0e08\\u0e48\\u0e32\\u0e22"},"column2":{"0":1,"1":2}}'

- `force_ascii` = False

In [None]:
# df_corpus.to_json('data/khaosod.json',orient='table',force_ascii=False,indent=2)  

# title	content
# 0	หลาก&หลายไอที - อลังการไลน์อัพโน้ตบุ๊กAI เลอโ...	เลอโนโวเปิดตัวผลิตภัณฑ์ใหม่ล่าสุดของไลน์อัพ Le...

### BeautifulSoup

In [None]:
# html_string = html_file.read().decode('utf-8')
# html_soup = BeautifulSoup(html_string,'html.parser')

# soup = BeautifulSoup(response.text, 'html.parser')

# Signature: soup.find(name=None, attrs={}, recursive=True, string=None, **kwargs)
# The original line was the bare signature, which fails with NameError because
# name/attrs/recursive/string/kwargs are not variables. The call below spells
# out the documented defaults so it runs once `soup` exists; extra keyword
# arguments (e.g. class_=..., href=...) filter on tag attributes.
element = soup.find(name=None, attrs={}, recursive=True, string=None)

In [None]:
# Common soup.find(...) filters. Assumes `soup` is a BeautifulSoup object built
# elsewhere (it is not defined in this cell) -- TODO confirm.
# Each line overwrites `result`, so only the last lookup is kept.
result = soup.find('a')     # by tag name
result = soup.find('div', class_='container') # by tag name and class (class_ because "class" is a Python keyword)
result = soup.find('a', href='/about') # by tag name and attribute value
result = soup.find('p', string='Hello World')  # by tag name and exact text



# PythaiNLP

## word_tokenize
- ตัดคำ

In [1]:
from pythainlp.tokenize import word_tokenize

# Split a Thai string into word tokens with the module-level helper;
# the result is a list of strings.
text = "สวัสดีครับ"
words = word_tokenize(text)
print(words)

['สวัสดี', 'ครับ']


## Tokenizer
- create tokenizer object from class

In [None]:
# https://pythainlp.org/docs/4.0/
from pythainlp.tokenize import Tokenizer

# Build a Tokenizer object once and reuse it for every call.
# The commented variant shows passing a custom dictionary and an engine name:
# tokenizer = Tokenizer(custom_dict=custom_words,engine='newmm')
tokenizer = Tokenizer()
text = "สวัสดีครับ"
words = tokenizer.word_tokenize(text)
print(words)

['สวัสดี', 'ครับ']


## Error
https://www.w3schools.com/python/python_ref_exceptions.asp


## Keyword
https://www.w3schools.com/python/python_ref_keywords.asp

In [None]:
from pythainlp.tokenize import word_tokenize    # word_tokenize -> split Thai text into words
from pythainlp.tokenize import Tokenizer        # Tokenizer -> create a reusable tokenizer object


from pythainlp.corpus import thai_words         # thai_words -> get PyThaiNLP's Thai word list



from nltk import ngrams         # ngrams -> generate n-grams of the specified size



from collections import Counter     # Counter -> count the elements of a collection of hashable items


from pythainlp.util import dict_trie    # dict_trie -> create a trie (prefix tree) structure

from sklearn.feature_extraction.text import CountVectorizer  # count words (bag-of-words) into a matrix
from sklearn.feature_extraction.text import TfidfVectorizer  # build a matrix of TF-IDF weights


# Steps to tokenize bigrams by LLR:
# - generate 2-grams
# - count occurrences
# - calculate LLR
# - filter the significant bigrams



In [None]:
import chromadb

from sentence_transformers import SentenceTransformer

from chromadb.utils import embedding_functions


In [None]:
# Top-level elements
root.findall(".")

# All 'neighbor' grand-children of 'country' children of the top-level
# elements
root.findall("./country/neighbor")

# Nodes with name='Singapore' that have a 'year' child
root.findall(".//year/..[@name='Singapore']")

# 'year' nodes that are children of nodes with name='Singapore'
root.findall(".//*[@name='Singapore']/year")

# All 'neighbor' nodes that are the second child of their parent
root.findall(".//neighbor[2]")